summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-10-20 00:10:27 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-10-20 00:10:27 -0300
commitd0b2f91bede3bd5e3d24dd6803e56eee959c1797 (patch)
tree7fee4ab0509879c373c4f2cbd5b8a5be5b4041ee /net
parente914f8eb445e8f74b00303c19c2ffceaedd16a05 (diff)
Linux-libre 4.8.2-gnupck-4.8.2-gnu
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/6lowpan_i.h4
-rw-r--r--net/6lowpan/Makefile2
-rw-r--r--net/6lowpan/core.c50
-rw-r--r--net/6lowpan/debugfs.c39
-rw-r--r--net/6lowpan/iphc.c167
-rw-r--r--net/6lowpan/ndisc.c241
-rw-r--r--net/8021q/vlan.c2
-rw-r--r--net/8021q/vlan_dev.c2
-rw-r--r--net/9p/trans_virtio.c4
-rw-r--r--net/Kconfig1
-rw-r--r--net/Makefile1
-rw-r--r--net/atm/clip.c2
-rw-r--r--net/batman-adv/Kconfig2
-rw-r--r--net/batman-adv/Makefile5
-rw-r--r--net/batman-adv/bat_algo.c140
-rw-r--r--net/batman-adv/bat_algo.h32
-rw-r--r--net/batman-adv/bat_iv_ogm.c106
-rw-r--r--net/batman-adv/bat_iv_ogm.h25
-rw-r--r--net/batman-adv/bat_v.c58
-rw-r--r--net/batman-adv/bat_v.h52
-rw-r--r--net/batman-adv/bat_v_elp.c9
-rw-r--r--net/batman-adv/bat_v_elp.h4
-rw-r--r--net/batman-adv/bat_v_ogm.c9
-rw-r--r--net/batman-adv/bat_v_ogm.h4
-rw-r--r--net/batman-adv/bitarray.c2
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c1
-rw-r--r--net/batman-adv/debugfs.c240
-rw-r--r--net/batman-adv/distributed-arp-table.c2
-rw-r--r--net/batman-adv/fragmentation.c53
-rw-r--r--net/batman-adv/fragmentation.h6
-rw-r--r--net/batman-adv/gateway_client.c16
-rw-r--r--net/batman-adv/gateway_common.c10
-rw-r--r--net/batman-adv/hard-interface.c25
-rw-r--r--net/batman-adv/icmp_socket.c1
-rw-r--r--net/batman-adv/log.c231
-rw-r--r--net/batman-adv/log.h111
-rw-r--r--net/batman-adv/main.c709
-rw-r--r--net/batman-adv/main.h121
-rw-r--r--net/batman-adv/multicast.c501
-rw-r--r--net/batman-adv/multicast.h3
-rw-r--r--net/batman-adv/netlink.c424
-rw-r--r--net/batman-adv/netlink.h32
-rw-r--r--net/batman-adv/network-coding.c2
-rw-r--r--net/batman-adv/originator.c91
-rw-r--r--net/batman-adv/originator.h6
-rw-r--r--net/batman-adv/packet.h61
-rw-r--r--net/batman-adv/routing.c73
-rw-r--r--net/batman-adv/send.c102
-rw-r--r--net/batman-adv/send.h4
-rw-r--r--net/batman-adv/soft-interface.c13
-rw-r--r--net/batman-adv/sysfs.c29
-rw-r--r--net/batman-adv/tp_meter.c1507
-rw-r--r--net/batman-adv/tp_meter.h34
-rw-r--r--net/batman-adv/translation-table.c10
-rw-r--r--net/batman-adv/tvlv.c632
-rw-r--r--net/batman-adv/tvlv.h61
-rw-r--r--net/batman-adv/types.h258
-rw-r--r--net/bluetooth/6lowpan.c13
-rw-r--r--net/bluetooth/af_bluetooth.c5
-rw-r--r--net/bluetooth/hci_conn.c2
-rw-r--r--net/bluetooth/hci_core.c52
-rw-r--r--net/bluetooth/hci_debugfs.c35
-rw-r--r--net/bluetooth/hci_event.c18
-rw-r--r--net/bluetooth/hci_request.c2
-rw-r--r--net/bluetooth/hci_sock.c7
-rw-r--r--net/bluetooth/hci_sysfs.c99
-rw-r--r--net/bluetooth/l2cap_core.c10
-rw-r--r--net/bluetooth/l2cap_sock.c14
-rw-r--r--net/bluetooth/mgmt.c18
-rw-r--r--net/bluetooth/smp.c67
-rw-r--r--net/bridge/br_device.c34
-rw-r--r--net/bridge/br_fdb.c52
-rw-r--r--net/bridge/br_forward.c203
-rw-r--r--net/bridge/br_if.c9
-rw-r--r--net/bridge/br_input.c72
-rw-r--r--net/bridge/br_multicast.c243
-rw-r--r--net/bridge/br_netlink.c148
-rw-r--r--net/bridge/br_private.h73
-rw-r--r--net/bridge/br_stp.c2
-rw-r--r--net/bridge/br_stp_if.c2
-rw-r--r--net/bridge/br_sysfs_br.c25
-rw-r--r--net/bridge/netfilter/ebt_802_3.c6
-rw-r--r--net/bridge/netfilter/ebt_arp.c43
-rw-r--r--net/bridge/netfilter/ebt_ip.c28
-rw-r--r--net/bridge/netfilter/ebt_ip6.c41
-rw-r--r--net/bridge/netfilter/ebt_stp.c97
-rw-r--r--net/bridge/netfilter/ebtables.c34
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c1
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c8
-rw-r--r--net/caif/chnl_net.c1
-rw-r--r--net/can/Makefile3
-rw-r--r--net/can/af_can.c22
-rw-r--r--net/can/bcm.c309
-rw-r--r--net/can/proc.c3
-rw-r--r--net/ceph/Makefile2
-rw-r--r--net/ceph/ceph_common.c2
-rw-r--r--net/ceph/ceph_fs.c30
-rw-r--r--net/ceph/debugfs.c12
-rw-r--r--net/ceph/mon_client.c6
-rw-r--r--net/ceph/msgpool.c1
-rw-r--r--net/ceph/osd_client.c49
-rw-r--r--net/ceph/osdmap.c58
-rw-r--r--net/ceph/string_table.c105
-rw-r--r--net/core/dev.c198
-rw-r--r--net/core/devlink.c91
-rw-r--r--net/core/drop_monitor.c3
-rw-r--r--net/core/ethtool.c1
-rw-r--r--net/core/fib_rules.c82
-rw-r--r--net/core/filter.c523
-rw-r--r--net/core/flow_dissector.c6
-rw-r--r--net/core/gen_estimator.c24
-rw-r--r--net/core/gen_stats.c35
-rw-r--r--net/core/neighbour.c13
-rw-r--r--net/core/net-sysfs.c15
-rw-r--r--net/core/netpoll.c2
-rw-r--r--net/core/pktgen.c43
-rw-r--r--net/core/rtnetlink.c156
-rw-r--r--net/core/skbuff.c46
-rw-r--r--net/core/utils.c8
-rw-r--r--net/dccp/ipv6.c12
-rw-r--r--net/dsa/Makefile2
-rw-r--r--net/dsa/dsa.c259
-rw-r--r--net/dsa/dsa2.c695
-rw-r--r--net/dsa/dsa_priv.h9
-rw-r--r--net/dsa/slave.c121
-rw-r--r--net/dsa/tag_brcm.c4
-rw-r--r--net/dsa/tag_dsa.c10
-rw-r--r--net/dsa/tag_edsa.c10
-rw-r--r--net/dsa/tag_trailer.c4
-rw-r--r--net/ieee802154/6lowpan/core.c30
-rw-r--r--net/ieee802154/6lowpan/rx.c2
-rw-r--r--net/ieee802154/6lowpan/tx.c113
-rw-r--r--net/ieee802154/core.c70
-rw-r--r--net/ieee802154/core.h2
-rw-r--r--net/ieee802154/nl802154.c54
-rw-r--r--net/ipv4/Kconfig16
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c5
-rw-r--r--net/ipv4/cipso_ipv4.c88
-rw-r--r--net/ipv4/devinet.c23
-rw-r--r--net/ipv4/fib_frontend.c3
-rw-r--r--net/ipv4/fib_rules.c6
-rw-r--r--net/ipv4/fib_semantics.c8
-rw-r--r--net/ipv4/fib_trie.c4
-rw-r--r--net/ipv4/fou.c81
-rw-r--r--net/ipv4/gre_demux.c1
-rw-r--r--net/ipv4/inet_connection_sock.c7
-rw-r--r--net/ipv4/inet_diag.c25
-rw-r--r--net/ipv4/inet_fragment.c2
-rw-r--r--net/ipv4/inet_timewait_sock.c5
-rw-r--r--net/ipv4/ip_forward.c4
-rw-r--r--net/ipv4/ip_gre.c52
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_output.c8
-rw-r--r--net/ipv4/ip_tunnel.c2
-rw-r--r--net/ipv4/ip_tunnel_core.c11
-rw-r--r--net/ipv4/ip_vti.c15
-rw-r--r--net/ipv4/ipip.c137
-rw-r--r--net/ipv4/ipmr.c21
-rw-r--r--net/ipv4/netfilter/arp_tables.c88
-rw-r--r--net/ipv4/netfilter/ip_tables.c65
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c4
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c14
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c3
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c11
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c1
-rw-r--r--net/ipv4/route.c13
-rw-r--r--net/ipv4/tcp.c69
-rw-r--r--net/ipv4/tcp_dctcp.c4
-rw-r--r--net/ipv4/tcp_diag.c7
-rw-r--r--net/ipv4/tcp_fastopen.c1
-rw-r--r--net/ipv4/tcp_input.c39
-rw-r--r--net/ipv4/tcp_ipv4.c31
-rw-r--r--net/ipv4/tcp_nv.c476
-rw-r--r--net/ipv4/tcp_output.c23
-rw-r--r--net/ipv4/tcp_timer.c82
-rw-r--r--net/ipv4/tunnel4.c72
-rw-r--r--net/ipv4/udp.c1
-rw-r--r--net/ipv4/udp_tunnel.c61
-rw-r--r--net/ipv4/udplite.c1
-rw-r--r--net/ipv4/xfrm4_policy.c10
-rw-r--r--net/ipv6/Makefile1
-rw-r--r--net/ipv6/addrconf.c325
-rw-r--r--net/ipv6/af_inet6.c15
-rw-r--r--net/ipv6/calipso.c1475
-rw-r--r--net/ipv6/exthdrs.c76
-rw-r--r--net/ipv6/exthdrs_core.c2
-rw-r--r--net/ipv6/fib6_rules.c6
-rw-r--r--net/ipv6/icmp.c78
-rw-r--r--net/ipv6/ila/ila.h3
-rw-r--r--net/ipv6/ila/ila_common.c16
-rw-r--r--net/ipv6/ila/ila_lwt.c4
-rw-r--r--net/ipv6/ila/ila_xlat.c8
-rw-r--r--net/ipv6/ip6_gre.c3
-rw-r--r--net/ipv6/ip6_icmp.c2
-rw-r--r--net/ipv6/ip6_input.c1
-rw-r--r--net/ipv6/ip6_output.c14
-rw-r--r--net/ipv6/ip6_vti.c19
-rw-r--r--net/ipv6/ip6mr.c34
-rw-r--r--net/ipv6/ipv6_sockglue.c1
-rw-r--r--net/ipv6/ndisc.c123
-rw-r--r--net/ipv6/netfilter/ip6_tables.c61
-rw-r--r--net/ipv6/netfilter/ip6table_mangle.c4
-rw-r--r--net/ipv6/netfilter/nft_chain_route_ipv6.c10
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c1
-rw-r--r--net/ipv6/ping.c35
-rw-r--r--net/ipv6/raw.c8
-rw-r--r--net/ipv6/route.c47
-rw-r--r--net/ipv6/sit.c140
-rw-r--r--net/ipv6/sysctl_net_ipv6.c19
-rw-r--r--net/ipv6/tcp_ipv6.c41
-rw-r--r--net/ipv6/udp.c9
-rw-r--r--net/ipv6/udplite.c1
-rw-r--r--net/ipv6/xfrm6_input.c15
-rw-r--r--net/ipv6/xfrm6_policy.c6
-rw-r--r--net/ipv6/xfrm6_tunnel.c2
-rw-r--r--net/irda/af_irda.c5
-rw-r--r--net/irda/ircomm/ircomm_tty_ioctl.c8
-rw-r--r--net/iucv/af_iucv.c228
-rw-r--r--net/iucv/iucv.c36
-rw-r--r--net/kcm/kcmproc.c6
-rw-r--r--net/kcm/kcmsock.c8
-rw-r--r--net/l2tp/l2tp_core.c3
-rw-r--r--net/l2tp/l2tp_eth.c4
-rw-r--r--net/l2tp/l2tp_ip6.c8
-rw-r--r--net/l2tp/l2tp_ppp.c9
-rw-r--r--net/l3mdev/l3mdev.c64
-rw-r--r--net/mac80211/agg-rx.c18
-rw-r--r--net/mac80211/agg-tx.c8
-rw-r--r--net/mac80211/cfg.c1
-rw-r--r--net/mac80211/debugfs.c173
-rw-r--r--net/mac80211/debugfs_sta.c78
-rw-r--r--net/mac80211/driver-ops.h2
-rw-r--r--net/mac80211/ieee80211_i.h32
-rw-r--r--net/mac80211/iface.c26
-rw-r--r--net/mac80211/main.c10
-rw-r--r--net/mac80211/mesh.c30
-rw-r--r--net/mac80211/mesh_hwmp.c3
-rw-r--r--net/mac80211/mesh_pathtbl.c2
-rw-r--r--net/mac80211/mesh_plink.c16
-rw-r--r--net/mac80211/rx.c9
-rw-r--r--net/mac80211/scan.c42
-rw-r--r--net/mac80211/spectmgmt.c45
-rw-r--r--net/mac80211/sta_info.c16
-rw-r--r--net/mac80211/status.c14
-rw-r--r--net/mac80211/tdls.c8
-rw-r--r--net/mac80211/tx.c335
-rw-r--r--net/mac80211/util.c34
-rw-r--r--net/mpls/af_mpls.c11
-rw-r--r--net/ncsi/Kconfig12
-rw-r--r--net/ncsi/Makefile4
-rw-r--r--net/ncsi/internal.h328
-rw-r--r--net/ncsi/ncsi-aen.c193
-rw-r--r--net/ncsi/ncsi-cmd.c367
-rw-r--r--net/ncsi/ncsi-manage.c1205
-rw-r--r--net/ncsi/ncsi-pkt.h415
-rw-r--r--net/ncsi/ncsi-rsp.c1035
-rw-r--r--net/netfilter/Kconfig10
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c25
-rw-r--r--net/netfilter/nf_conntrack_core.c220
-rw-r--r--net/netfilter/nf_conntrack_expect.c2
-rw-r--r--net/netfilter/nf_conntrack_extend.c15
-rw-r--r--net/netfilter/nf_conntrack_ftp.c58
-rw-r--r--net/netfilter/nf_conntrack_h323_asn1.c3
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c16
-rw-r--r--net/netfilter/nf_conntrack_helper.c127
-rw-r--r--net/netfilter/nf_conntrack_irc.c36
-rw-r--r--net/netfilter/nf_conntrack_labels.c28
-rw-r--r--net/netfilter/nf_conntrack_netlink.c20
-rw-r--r--net/netfilter/nf_conntrack_sane.c57
-rw-r--r--net/netfilter/nf_conntrack_sip.c79
-rw-r--r--net/netfilter/nf_conntrack_standalone.c54
-rw-r--r--net/netfilter/nf_conntrack_tftp.c48
-rw-r--r--net/netfilter/nf_log.c33
-rw-r--r--net/netfilter/nf_nat_core.c154
-rw-r--r--net/netfilter/nf_tables_api.c414
-rw-r--r--net/netfilter/nf_tables_netdev.c1
-rw-r--r--net/netfilter/nf_tables_trace.c2
-rw-r--r--net/netfilter/nfnetlink_acct.c23
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c85
-rw-r--r--net/netfilter/nfnetlink_log.c10
-rw-r--r--net/netfilter/nfnetlink_queue.c6
-rw-r--r--net/netfilter/nft_compat.c75
-rw-r--r--net/netfilter/nft_ct.c41
-rw-r--r--net/netfilter/nft_dynset.c7
-rw-r--r--net/netfilter/nft_exthdr.c11
-rw-r--r--net/netfilter/nft_hash.c24
-rw-r--r--net/netfilter/nft_log.c51
-rw-r--r--net/netfilter/nft_lookup.c43
-rw-r--r--net/netfilter/nft_meta.c26
-rw-r--r--net/netfilter/nft_rbtree.c36
-rw-r--r--net/netfilter/nft_reject.c16
-rw-r--r--net/netfilter/nft_reject_inet.c7
-rw-r--r--net/netfilter/x_tables.c53
-rw-r--r--net/netfilter/xt_NFLOG.c3
-rw-r--r--net/netfilter/xt_RATEEST.c2
-rw-r--r--net/netfilter/xt_TPROXY.c4
-rw-r--r--net/netfilter/xt_TRACE.c25
-rw-r--r--net/netfilter/xt_connlabel.c29
-rw-r--r--net/netfilter/xt_nfacct.c2
-rw-r--r--net/netfilter/xt_owner.c41
-rw-r--r--net/netfilter/xt_physdev.c8
-rw-r--r--net/netfilter/xt_tcpudp.c7
-rw-r--r--net/netlabel/Kconfig1
-rw-r--r--net/netlabel/Makefile2
-rw-r--r--net/netlabel/netlabel_calipso.c740
-rw-r--r--net/netlabel/netlabel_calipso.h151
-rw-r--r--net/netlabel/netlabel_domainhash.c293
-rw-r--r--net/netlabel/netlabel_domainhash.h17
-rw-r--r--net/netlabel/netlabel_kapi.c382
-rw-r--r--net/netlabel/netlabel_mgmt.c85
-rw-r--r--net/netlabel/netlabel_mgmt.h27
-rw-r--r--net/netlabel/netlabel_unlabeled.c5
-rw-r--r--net/netlabel/netlabel_user.c5
-rw-r--r--net/netlink/af_netlink.h14
-rw-r--r--net/nfc/digital_core.c28
-rw-r--r--net/nfc/digital_dep.c316
-rw-r--r--net/nfc/digital_technology.c11
-rw-r--r--net/nfc/hci/llc.c17
-rw-r--r--net/nfc/llcp_commands.c23
-rw-r--r--net/nfc/llcp_core.c9
-rw-r--r--net/openvswitch/actions.c40
-rw-r--r--net/openvswitch/conntrack.c81
-rw-r--r--net/openvswitch/datapath.c42
-rw-r--r--net/openvswitch/datapath.h5
-rw-r--r--net/openvswitch/flow_netlink.c9
-rw-r--r--net/openvswitch/vport-geneve.c9
-rw-r--r--net/openvswitch/vport-gre.c11
-rw-r--r--net/openvswitch/vport-internal_dev.c4
-rw-r--r--net/openvswitch/vport-vxlan.c9
-rw-r--r--net/openvswitch/vport.c1
-rw-r--r--net/packet/af_packet.c42
-rw-r--r--net/rds/bind.c6
-rw-r--r--net/rds/cong.c3
-rw-r--r--net/rds/connection.c329
-rw-r--r--net/rds/ib.c9
-rw-r--r--net/rds/ib.h8
-rw-r--r--net/rds/ib_cm.c9
-rw-r--r--net/rds/ib_rdma.c3
-rw-r--r--net/rds/ib_recv.c4
-rw-r--r--net/rds/ib_send.c4
-rw-r--r--net/rds/loop.c15
-rw-r--r--net/rds/message.c1
-rw-r--r--net/rds/rdma_transport.c1
-rw-r--r--net/rds/rds.h178
-rw-r--r--net/rds/rds_single_path.h30
-rw-r--r--net/rds/recv.c106
-rw-r--r--net/rds/send.c356
-rw-r--r--net/rds/tcp.c160
-rw-r--r--net/rds/tcp.h23
-rw-r--r--net/rds/tcp_connect.c43
-rw-r--r--net/rds/tcp_listen.c76
-rw-r--r--net/rds/tcp_recv.c38
-rw-r--r--net/rds/tcp_send.c39
-rw-r--r--net/rds/threads.c105
-rw-r--r--net/rxrpc/Makefile37
-rw-r--r--net/rxrpc/af_rxrpc.c287
-rw-r--r--net/rxrpc/ar-internal.h516
-rw-r--r--net/rxrpc/call_accept.c477
-rw-r--r--net/rxrpc/call_event.c1302
-rw-r--r--net/rxrpc/call_object.c801
-rw-r--r--net/rxrpc/conn_client.c372
-rw-r--r--net/rxrpc/conn_event.c394
-rw-r--r--net/rxrpc/conn_object.c340
-rw-r--r--net/rxrpc/conn_service.c230
-rw-r--r--net/rxrpc/input.c757
-rw-r--r--net/rxrpc/insecure.c7
-rw-r--r--net/rxrpc/key.c1237
-rw-r--r--net/rxrpc/local_event.c116
-rw-r--r--net/rxrpc/local_object.c390
-rw-r--r--net/rxrpc/misc.c6
-rw-r--r--net/rxrpc/output.c721
-rw-r--r--net/rxrpc/peer_event.c281
-rw-r--r--net/rxrpc/peer_object.c315
-rw-r--r--net/rxrpc/proc.c192
-rw-r--r--net/rxrpc/recvmsg.c417
-rw-r--r--net/rxrpc/rxkad.c263
-rw-r--r--net/rxrpc/security.c168
-rw-r--r--net/rxrpc/skbuff.c165
-rw-r--r--net/rxrpc/sysctl.c12
-rw-r--r--net/rxrpc/utils.c46
-rw-r--r--net/sched/Kconfig10
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_api.c306
-rw-r--r--net/sched/act_bpf.c41
-rw-r--r--net/sched/act_connmark.c30
-rw-r--r--net/sched/act_csum.c29
-rw-r--r--net/sched/act_gact.c31
-rw-r--r--net/sched/act_ife.c58
-rw-r--r--net/sched/act_ipt.c67
-rw-r--r--net/sched/act_mirred.c35
-rw-r--r--net/sched/act_nat.c29
-rw-r--r--net/sched/act_pedit.c36
-rw-r--r--net/sched/act_police.c105
-rw-r--r--net/sched/act_simple.c39
-rw-r--r--net/sched/act_skbedit.c62
-rw-r--r--net/sched/act_vlan.c41
-rw-r--r--net/sched/cls_api.c99
-rw-r--r--net/sched/cls_bpf.c7
-rw-r--r--net/sched/cls_flower.c65
-rw-r--r--net/sched/cls_matchall.c318
-rw-r--r--net/sched/sch_api.c30
-rw-r--r--net/sched/sch_atm.c33
-rw-r--r--net/sched/sch_blackhole.c5
-rw-r--r--net/sched/sch_cbq.c305
-rw-r--r--net/sched/sch_choke.c41
-rw-r--r--net/sched/sch_codel.c10
-rw-r--r--net/sched/sch_drr.c38
-rw-r--r--net/sched/sch_dsmark.c27
-rw-r--r--net/sched/sch_fifo.c18
-rw-r--r--net/sched/sch_fq.c29
-rw-r--r--net/sched/sch_fq_codel.c64
-rw-r--r--net/sched/sch_generic.c99
-rw-r--r--net/sched/sch_gred.c42
-rw-r--r--net/sched/sch_hfsc.c108
-rw-r--r--net/sched/sch_hhf.c24
-rw-r--r--net/sched/sch_htb.c68
-rw-r--r--net/sched/sch_mq.c2
-rw-r--r--net/sched/sch_mqprio.c11
-rw-r--r--net/sched/sch_multiq.c32
-rw-r--r--net/sched/sch_netem.c73
-rw-r--r--net/sched/sch_pie.c7
-rw-r--r--net/sched/sch_plug.c19
-rw-r--r--net/sched/sch_prio.c27
-rw-r--r--net/sched/sch_qfq.c66
-rw-r--r--net/sched/sch_red.c28
-rw-r--r--net/sched/sch_sfb.c10
-rw-r--r--net/sched/sch_sfq.c11
-rw-r--r--net/sched/sch_tbf.c34
-rw-r--r--net/sched/sch_teql.c4
-rw-r--r--net/sctp/Makefile3
-rw-r--r--net/sctp/associola.c1
-rw-r--r--net/sctp/chunk.c30
-rw-r--r--net/sctp/endpointola.c1
-rw-r--r--net/sctp/input.c96
-rw-r--r--net/sctp/inqueue.c72
-rw-r--r--net/sctp/ipv6.c15
-rw-r--r--net/sctp/offload.c119
-rw-r--r--net/sctp/output.c390
-rw-r--r--net/sctp/outqueue.c99
-rw-r--r--net/sctp/protocol.c9
-rw-r--r--net/sctp/sctp_diag.c76
-rw-r--r--net/sctp/sm_make_chunk.c32
-rw-r--r--net/sctp/sm_sideeffect.c4
-rw-r--r--net/sctp/sm_statefuns.c9
-rw-r--r--net/sctp/socket.c302
-rw-r--r--net/sctp/ulpevent.c17
-rw-r--r--net/sctp/ulpqueue.c4
-rw-r--r--net/sunrpc/auth.c8
-rw-r--r--net/sunrpc/auth_generic.c9
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c3
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c2
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c12
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c5
-rw-r--r--net/sunrpc/auth_null.c1
-rw-r--r--net/sunrpc/auth_unix.c1
-rw-r--r--net/sunrpc/cache.c2
-rw-r--r--net/sunrpc/clnt.c30
-rw-r--r--net/sunrpc/rpc_pipe.c8
-rw-r--r--net/sunrpc/sched.c67
-rw-r--r--net/sunrpc/svc.c8
-rw-r--r--net/sunrpc/svc_xprt.c51
-rw-r--r--net/sunrpc/svcsock.c150
-rw-r--r--net/sunrpc/xprt.c40
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c378
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c318
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c274
-rw-r--r--net/sunrpc/xprtrdma/transport.c40
-rw-r--r--net/sunrpc/xprtrdma/verbs.c271
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h111
-rw-r--r--net/sunrpc/xprtsock.c177
-rw-r--r--net/switchdev/switchdev.c5
-rw-r--r--net/tipc/Makefile2
-rw-r--r--net/tipc/addr.h6
-rw-r--r--net/tipc/bearer.c33
-rw-r--r--net/tipc/bearer.h3
-rw-r--r--net/tipc/core.c1
-rw-r--r--net/tipc/core.h15
-rw-r--r--net/tipc/discover.c5
-rw-r--r--net/tipc/link.c51
-rw-r--r--net/tipc/monitor.c804
-rw-r--r--net/tipc/monitor.h82
-rw-r--r--net/tipc/name_distr.c8
-rw-r--r--net/tipc/netlink.c27
-rw-r--r--net/tipc/netlink.h1
-rw-r--r--net/tipc/node.c213
-rw-r--r--net/tipc/node.h5
-rw-r--r--net/tipc/server.c3
-rw-r--r--net/tipc/udp_media.c29
-rw-r--r--net/unix/af_unix.c1
-rw-r--r--net/vmw_vsock/Kconfig20
-rw-r--r--net/vmw_vsock/Makefile6
-rw-r--r--net/vmw_vsock/af_vsock.c31
-rw-r--r--net/vmw_vsock/virtio_transport.c620
-rw-r--r--net/vmw_vsock/virtio_transport_common.c992
-rw-r--r--net/vmw_vsock/vmci_transport.c2
-rw-r--r--net/wireless/chan.c3
-rw-r--r--net/wireless/core.c34
-rw-r--r--net/wireless/core.h16
-rw-r--r--net/wireless/nl80211.c437
-rw-r--r--net/wireless/nl80211.h2
-rw-r--r--net/wireless/scan.c18
-rw-r--r--net/wireless/sme.c8
-rw-r--r--net/wireless/trace.h33
-rw-r--r--net/xfrm/xfrm_input.c14
-rw-r--r--net/xfrm/xfrm_policy.c4
-rw-r--r--net/xfrm/xfrm_state.c1
-rw-r--r--net/xfrm/xfrm_user.c22
509 files changed, 36609 insertions, 8783 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h
index 97ecc27ae..a67caee11 100644
--- a/net/6lowpan/6lowpan_i.h
+++ b/net/6lowpan/6lowpan_i.h
@@ -12,6 +12,10 @@ static inline bool lowpan_is_ll(const struct net_device *dev,
return lowpan_dev(dev)->lltype == lltype;
}
+extern const struct ndisc_ops lowpan_ndisc_ops;
+
+int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev);
+
#ifdef CONFIG_6LOWPAN_DEBUGFS
int lowpan_dev_debugfs_init(struct net_device *dev);
void lowpan_dev_debugfs_exit(struct net_device *dev);
diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile
index e44f3bf2d..12d131ab2 100644
--- a/net/6lowpan/Makefile
+++ b/net/6lowpan/Makefile
@@ -1,6 +1,6 @@
obj-$(CONFIG_6LOWPAN) += 6lowpan.o
-6lowpan-y := core.o iphc.o nhc.o
+6lowpan-y := core.o iphc.o nhc.o ndisc.o
6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o
#rfc6282 nhcs
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index 7a240b3ea..5945f7e19 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <net/6lowpan.h>
+#include <net/addrconf.h>
#include "6lowpan_i.h"
@@ -33,6 +34,8 @@ int lowpan_register_netdevice(struct net_device *dev,
for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
lowpan_dev(dev)->ctx.table[i].id = i;
+ dev->ndisc_ops = &lowpan_ndisc_ops;
+
ret = register_netdevice(dev);
if (ret < 0)
return ret;
@@ -72,16 +75,61 @@ void lowpan_unregister_netdev(struct net_device *dev)
}
EXPORT_SYMBOL(lowpan_unregister_netdev);
+int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev)
+{
+ struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+ /* Set short_addr autoconfiguration if short_addr is present only */
+ if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr))
+ return -1;
+
+ /* For either address format, all zero addresses MUST NOT be used */
+ if (wpan_dev->pan_id == cpu_to_le16(0x0000) &&
+ wpan_dev->short_addr == cpu_to_le16(0x0000))
+ return -1;
+
+ /* Alternatively, if no PAN ID is known, 16 zero bits may be used */
+ if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST))
+ memset(eui, 0, 2);
+ else
+ ieee802154_le16_to_be16(eui, &wpan_dev->pan_id);
+
+ /* The "Universal/Local" (U/L) bit shall be set to zero */
+ eui[0] &= ~2;
+ eui[2] = 0;
+ eui[3] = 0xFF;
+ eui[4] = 0xFE;
+ eui[5] = 0;
+ ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr);
+ return 0;
+}
+
static int lowpan_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct inet6_dev *idev;
+ struct in6_addr addr;
int i;
if (dev->type != ARPHRD_6LOWPAN)
return NOTIFY_DONE;
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ return NOTIFY_DONE;
+
switch (event) {
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ /* (802.15.4 6LoWPAN short address slaac handling */
+ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) &&
+ addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) {
+ __ipv6_addr_set_half(&addr.s6_addr32[0],
+ htonl(0xFE800000), 0);
+ addrconf_add_linklocal(idev, &addr, 0);
+ }
+ break;
case NETDEV_DOWN:
for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE,
@@ -112,8 +160,6 @@ static int __init lowpan_module_init(void)
return ret;
}
- request_module_nowait("ipv6");
-
request_module_nowait("nhc_dest");
request_module_nowait("nhc_fragment");
request_module_nowait("nhc_hop");
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
index acbaa3db4..24915e0bb 100644
--- a/net/6lowpan/debugfs.c
+++ b/net/6lowpan/debugfs.c
@@ -245,6 +245,41 @@ static const struct file_operations lowpan_context_fops = {
.release = single_release,
};
+static int lowpan_short_addr_get(void *data, u64 *val)
+{
+ struct wpan_dev *wdev = data;
+
+ rtnl_lock();
+ *val = le16_to_cpu(wdev->short_addr);
+ rtnl_unlock();
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get,
+ NULL, "0x%04llx\n");
+
+static int lowpan_dev_debugfs_802154_init(const struct net_device *dev,
+ struct lowpan_dev *ldev)
+{
+ struct dentry *dentry, *root;
+
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return 0;
+
+ root = debugfs_create_dir("ieee802154", ldev->iface_debugfs);
+ if (!root)
+ return -EINVAL;
+
+ dentry = debugfs_create_file("short_addr", 0444, root,
+ lowpan_802154_dev(dev)->wdev->ieee802154_ptr,
+ &lowpan_short_addr_fops);
+ if (!dentry)
+ return -EINVAL;
+
+ return 0;
+}
+
int lowpan_dev_debugfs_init(struct net_device *dev)
{
struct lowpan_dev *ldev = lowpan_dev(dev);
@@ -272,6 +307,10 @@ int lowpan_dev_debugfs_init(struct net_device *dev)
goto remove_root;
}
+ ret = lowpan_dev_debugfs_802154_init(dev, ldev);
+ if (ret < 0)
+ goto remove_root;
+
return 0;
remove_root:
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 8501dd532..79f1fa225 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -761,22 +761,75 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = {
[LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11,
};
-static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr,
+static inline bool
+lowpan_iphc_compress_ctx_802154_lladdr(const struct in6_addr *ipaddr,
+ const struct lowpan_iphc_ctx *ctx,
+ const void *lladdr)
+{
+ const struct ieee802154_addr *addr = lladdr;
+ unsigned char extended_addr[EUI64_ADDR_LEN];
+ bool lladdr_compress = false;
+ struct in6_addr tmp = {};
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr);
+ /* check for SAM/DAM = 11 */
+ memcpy(&tmp.s6_addr[8], &extended_addr, EUI64_ADDR_LEN);
+ /* second bit-flip (Universe/Local) is done according RFC2464 */
+ tmp.s6_addr[8] ^= 0x02;
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr))
+ lladdr_compress = true;
+ break;
+ case IEEE802154_ADDR_SHORT:
+ tmp.s6_addr[11] = 0xFF;
+ tmp.s6_addr[12] = 0xFE;
+ ieee802154_le16_to_be16(&tmp.s6_addr16[7],
+ &addr->short_addr);
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr))
+ lladdr_compress = true;
+ break;
+ default:
+ /* should never handled and filtered by 802154 6lowpan */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return lladdr_compress;
+}
+
+static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct net_device *dev,
+ const struct in6_addr *ipaddr,
const struct lowpan_iphc_ctx *ctx,
const unsigned char *lladdr, bool sam)
{
struct in6_addr tmp = {};
u8 dam;
- /* check for SAM/DAM = 11 */
- memcpy(&tmp.s6_addr[8], lladdr, 8);
- /* second bit-flip (Universe/Local) is done according RFC2464 */
- tmp.s6_addr[8] ^= 0x02;
- /* context information are always used */
- ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
- if (ipv6_addr_equal(&tmp, ipaddr)) {
- dam = LOWPAN_IPHC_DAM_11;
- goto out;
+ switch (lowpan_dev(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_iphc_compress_ctx_802154_lladdr(ipaddr, ctx,
+ lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11;
+ goto out;
+ }
+ break;
+ default:
+ /* check for SAM/DAM = 11 */
+ memcpy(&tmp.s6_addr[8], lladdr, EUI64_ADDR_LEN);
+ /* second bit-flip (Universe/Local) is done according RFC2464 */
+ tmp.s6_addr[8] ^= 0x02;
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr)) {
+ dam = LOWPAN_IPHC_DAM_11;
+ goto out;
+ }
+ break;
}
memset(&tmp, 0, sizeof(tmp));
@@ -813,28 +866,85 @@ out:
return dam;
}
-static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr,
+static inline bool
+lowpan_iphc_compress_802154_lladdr(const struct in6_addr *ipaddr,
+ const void *lladdr)
+{
+ const struct ieee802154_addr *addr = lladdr;
+ unsigned char extended_addr[EUI64_ADDR_LEN];
+ bool lladdr_compress = false;
+ struct in6_addr tmp = {};
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr);
+ if (is_addr_mac_addr_based(ipaddr, extended_addr))
+ lladdr_compress = true;
+ break;
+ case IEEE802154_ADDR_SHORT:
+ /* fe:80::ff:fe00:XXXX
+ * \__/
+ * short_addr
+ *
+ * Universe/Local bit is zero.
+ */
+ tmp.s6_addr[0] = 0xFE;
+ tmp.s6_addr[1] = 0x80;
+ tmp.s6_addr[11] = 0xFF;
+ tmp.s6_addr[12] = 0xFE;
+ ieee802154_le16_to_be16(&tmp.s6_addr16[7],
+ &addr->short_addr);
+ if (ipv6_addr_equal(&tmp, ipaddr))
+ lladdr_compress = true;
+ break;
+ default:
+ /* should never handled and filtered by 802154 6lowpan */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return lladdr_compress;
+}
+
+static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct net_device *dev,
+ const struct in6_addr *ipaddr,
const unsigned char *lladdr, bool sam)
{
- u8 dam = LOWPAN_IPHC_DAM_00;
+ u8 dam = LOWPAN_IPHC_DAM_01;
- if (is_addr_mac_addr_based(ipaddr, lladdr)) {
- dam = LOWPAN_IPHC_DAM_11; /* 0-bits */
- pr_debug("address compression 0 bits\n");
- } else if (lowpan_is_iid_16_bit_compressable(ipaddr)) {
+ switch (lowpan_dev(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_iphc_compress_802154_lladdr(ipaddr, lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11; /* 0-bits */
+ pr_debug("address compression 0 bits\n");
+ goto out;
+ }
+ break;
+ default:
+ if (is_addr_mac_addr_based(ipaddr, lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11; /* 0-bits */
+ pr_debug("address compression 0 bits\n");
+ goto out;
+ }
+ break;
+ }
+
+ if (lowpan_is_iid_16_bit_compressable(ipaddr)) {
/* compress IID to 16 bits xxxx::XXXX */
lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2);
dam = LOWPAN_IPHC_DAM_10; /* 16-bits */
raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)",
*hc_ptr - 2, 2);
- } else {
- /* do not compress IID => xxxx::IID */
- lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8);
- dam = LOWPAN_IPHC_DAM_01; /* 64-bits */
- raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)",
- *hc_ptr - 8, 8);
+ goto out;
}
+ /* do not compress IID => xxxx::IID */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8);
+ raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)",
+ *hc_ptr - 8, 8);
+
+out:
+
if (sam)
return lowpan_iphc_dam_to_sam_value[dam];
else
@@ -1013,9 +1123,6 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
iphc0 = LOWPAN_DISPATCH_IPHC;
iphc1 = 0;
- raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN);
- raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN);
-
raw_dump_table(__func__, "sending raw skb network uncompressed packet",
skb->data, skb->len);
@@ -1088,14 +1195,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
iphc1 |= LOWPAN_IPHC_SAC;
} else {
if (sci) {
- iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr,
+ iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev,
+ &hdr->saddr,
&sci_entry, saddr,
true);
iphc1 |= LOWPAN_IPHC_SAC;
} else {
if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL &&
lowpan_is_linklocal_zero_padded(hdr->saddr)) {
- iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev,
&hdr->saddr,
saddr, true);
pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
@@ -1123,14 +1231,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
}
} else {
if (dci) {
- iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr,
+ iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev,
+ &hdr->daddr,
&dci_entry, daddr,
false);
iphc1 |= LOWPAN_IPHC_DAC;
} else {
if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL &&
lowpan_is_linklocal_zero_padded(hdr->daddr)) {
- iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev,
&hdr->daddr,
daddr, false);
pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n",
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
new file mode 100644
index 000000000..86450b7e2
--- /dev/null
+++ b/net/6lowpan/ndisc.c
@@ -0,0 +1,241 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * (C) 2016 Pengutronix, Alexander Aring <aar@pengutronix.de>
+ */
+
+#include <net/6lowpan.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+
+#include "6lowpan_i.h"
+
+static int lowpan_ndisc_is_useropt(u8 nd_opt_type)
+{
+ return nd_opt_type == ND_OPT_6CO;
+}
+
+#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+#define NDISC_802154_SHORT_ADDR_LENGTH 1
+static int lowpan_ndisc_parse_802154_options(const struct net_device *dev,
+ struct nd_opt_hdr *nd_opt,
+ struct ndisc_options *ndopts)
+{
+ switch (nd_opt->nd_opt_len) {
+ case NDISC_802154_SHORT_ADDR_LENGTH:
+ if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type])
+ ND_PRINTK(2, warn,
+ "%s: duplicated short addr ND6 option found: type=%d\n",
+ __func__, nd_opt->nd_opt_type);
+ else
+ ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt;
+ return 1;
+ default:
+ /* all others will be handled by ndisc IPv6 option parsing */
+ return 0;
+ }
+}
+
+static int lowpan_ndisc_parse_options(const struct net_device *dev,
+ struct nd_opt_hdr *nd_opt,
+ struct ndisc_options *ndopts)
+{
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return 0;
+
+ switch (nd_opt->nd_opt_type) {
+ case ND_OPT_SOURCE_LL_ADDR:
+ case ND_OPT_TARGET_LL_ADDR:
+ return lowpan_ndisc_parse_802154_options(dev, nd_opt, ndopts);
+ default:
+ return 0;
+ }
+}
+
+static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
+ u8 icmp6_type,
+ const struct ndisc_options *ndopts)
+{
+ struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n));
+ u8 *lladdr_short = NULL;
+
+ switch (icmp6_type) {
+ case NDISC_ROUTER_SOLICITATION:
+ case NDISC_ROUTER_ADVERTISEMENT:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ if (ndopts->nd_802154_opts_src_lladdr) {
+ lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ if (!lladdr_short) {
+ ND_PRINTK(2, warn,
+ "NA: invalid short link-layer address length\n");
+ return;
+ }
+ }
+ break;
+ case NDISC_REDIRECT:
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ if (ndopts->nd_802154_opts_tgt_lladdr) {
+ lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ if (!lladdr_short) {
+ ND_PRINTK(2, warn,
+ "NA: invalid short link-layer address length\n");
+ return;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+ write_lock_bh(&n->lock);
+ if (lladdr_short) {
+ ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short);
+ if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr))
+ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+ } else {
+ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+ }
+ write_unlock_bh(&n->lock);
+}
+
+static void lowpan_ndisc_update(const struct net_device *dev,
+ struct neighbour *n, u32 flags, u8 icmp6_type,
+ const struct ndisc_options *ndopts)
+{
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return;
+
+ /* react on overrides only. TODO check if this is really right. */
+ if (flags & NEIGH_UPDATE_F_OVERRIDE)
+ lowpan_ndisc_802154_update(n, flags, icmp6_type, ndopts);
+}
+
+static int lowpan_ndisc_opt_addr_space(const struct net_device *dev,
+ u8 icmp6_type, struct neighbour *neigh,
+ u8 *ha_buf, u8 **ha)
+{
+ struct lowpan_802154_neigh *n;
+ struct wpan_dev *wpan_dev;
+ int addr_space = 0;
+
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return 0;
+
+ switch (icmp6_type) {
+ case NDISC_REDIRECT:
+ n = lowpan_802154_neigh(neighbour_priv(neigh));
+
+ read_lock_bh(&neigh->lock);
+ if (lowpan_802154_is_valid_src_short_addr(n->short_addr)) {
+ memcpy(ha_buf, &n->short_addr,
+ IEEE802154_SHORT_ADDR_LEN);
+ read_unlock_bh(&neigh->lock);
+ addr_space += __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0);
+ *ha = ha_buf;
+ } else {
+ read_unlock_bh(&neigh->lock);
+ }
+ break;
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ case NDISC_ROUTER_SOLICITATION:
+ wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+ if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr))
+ addr_space = __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0);
+ break;
+ default:
+ break;
+ }
+
+ return addr_space;
+}
+
+static void lowpan_ndisc_fill_addr_option(const struct net_device *dev,
+ struct sk_buff *skb, u8 icmp6_type,
+ const u8 *ha)
+{
+ struct wpan_dev *wpan_dev;
+ __be16 short_addr;
+ u8 opt_type;
+
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return;
+
+ switch (icmp6_type) {
+ case NDISC_REDIRECT:
+ if (ha) {
+ ieee802154_le16_to_be16(&short_addr, ha);
+ __ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
+ &short_addr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ }
+ return;
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ opt_type = ND_OPT_TARGET_LL_ADDR;
+ break;
+ case NDISC_ROUTER_SOLICITATION:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ opt_type = ND_OPT_SOURCE_LL_ADDR;
+ break;
+ default:
+ return;
+ }
+
+ wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+ if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) {
+ ieee802154_le16_to_be16(&short_addr,
+ &wpan_dev->short_addr);
+ __ndisc_fill_addr_option(skb, opt_type, &short_addr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ }
+}
+
+static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net,
+ struct net_device *dev,
+ const struct prefix_info *pinfo,
+ struct inet6_dev *in6_dev,
+ struct in6_addr *addr,
+ int addr_type, u32 addr_flags,
+ bool sllao, bool tokenized,
+ __u32 valid_lft,
+ u32 prefered_lft,
+ bool dev_addr_generated)
+{
+ int err;
+
+ /* generates short based address for RA PIO's */
+ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && dev_addr_generated &&
+ !addrconf_ifid_802154_6lowpan(addr->s6_addr + 8, dev)) {
+ err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
+ addr, addr_type, addr_flags,
+ sllao, tokenized, valid_lft,
+ prefered_lft);
+ if (err)
+ ND_PRINTK(2, warn,
+ "RA: could not add a short address based address for prefix: %pI6c\n",
+ &pinfo->prefix);
+ }
+}
+#endif
+
+const struct ndisc_ops lowpan_ndisc_ops = {
+ .is_useropt = lowpan_ndisc_is_useropt,
+#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+ .parse_options = lowpan_ndisc_parse_options,
+ .update = lowpan_ndisc_update,
+ .opt_addr_space = lowpan_ndisc_opt_addr_space,
+ .fill_addr_option = lowpan_ndisc_fill_addr_option,
+ .prefix_rcv_add_addr = lowpan_ndisc_prefix_rcv_add_addr,
+#endif
+};
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 82a116ba5..8de138d33 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -169,7 +169,7 @@ int register_vlan_dev(struct net_device *dev)
if (err < 0)
goto out_uninit_mvrp;
- vlan->nest_level = dev_get_nest_level(real_dev, is_vlan_dev) + 1;
+ vlan->nest_level = dev_get_nest_level(real_dev) + 1;
err = register_netdevice(dev);
if (err < 0)
goto out_uninit_mvrp;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 516b0e732..fbfacd51a 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -792,6 +792,8 @@ static const struct net_device_ops vlan_netdev_ops = {
.ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup,
#endif
.ndo_fix_features = vlan_dev_fix_features,
+ .ndo_neigh_construct = netdev_default_l2upper_neigh_construct,
+ .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy,
.ndo_fdb_add = switchdev_port_fdb_add,
.ndo_fdb_del = switchdev_port_fdb_del,
.ndo_fdb_dump = switchdev_port_fdb_dump,
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 4acb1d541..f24b25c25 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -507,8 +507,8 @@ err_out:
/* wakeup anybody waiting for slots to pin pages */
wake_up(&vp_wq);
}
- kfree(in_pages);
- kfree(out_pages);
+ kvfree(in_pages);
+ kvfree(out_pages);
return err;
}
diff --git a/net/Kconfig b/net/Kconfig
index ff40562a7..c2cdbce62 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -237,6 +237,7 @@ source "net/hsr/Kconfig"
source "net/switchdev/Kconfig"
source "net/l3mdev/Kconfig"
source "net/qrtr/Kconfig"
+source "net/ncsi/Kconfig"
config RPS
bool
diff --git a/net/Makefile b/net/Makefile
index bdd14553a..9bd20bb86 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -79,3 +79,4 @@ ifneq ($(CONFIG_NET_L3_MASTER_DEV),)
obj-y += l3mdev/
endif
obj-$(CONFIG_QRTR) += qrtr/
+obj-$(CONFIG_NET_NCSI) += ncsi/
diff --git a/net/atm/clip.c b/net/atm/clip.c
index e07f551a8..53b4ac09e 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -286,7 +286,7 @@ static const struct neigh_ops clip_neigh_ops = {
.connected_output = neigh_direct_output,
};
-static int clip_constructor(struct neighbour *neigh)
+static int clip_constructor(struct net_device *dev, struct neighbour *neigh)
{
struct atmarp_entry *entry = neighbour_priv(neigh);
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index f66930ee3..833bb145b 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -66,7 +66,7 @@ config BATMAN_ADV_NC
config BATMAN_ADV_MCAST
bool "Multicast optimisation"
- depends on BATMAN_ADV
+ depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
default n
help
This option enables the multicast optimisation which aims to
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index 797cf2fc8..a83fc6c58 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -17,6 +17,7 @@
#
obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
+batman-adv-y += bat_algo.o
batman-adv-y += bat_iv_ogm.o
batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v.o
batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o
@@ -31,12 +32,16 @@ batman-adv-y += gateway_common.o
batman-adv-y += hard-interface.o
batman-adv-y += hash.o
batman-adv-y += icmp_socket.o
+batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o
batman-adv-y += main.o
batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
+batman-adv-y += netlink.o
batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o
batman-adv-y += originator.o
batman-adv-y += routing.o
batman-adv-y += send.o
batman-adv-y += soft-interface.o
batman-adv-y += sysfs.o
+batman-adv-y += tp_meter.o
batman-adv-y += translation-table.o
+batman-adv-y += tvlv.o
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
new file mode 100644
index 000000000..81dbbf569
--- /dev/null
+++ b/net/batman-adv/bat_algo.c
@@ -0,0 +1,140 @@
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+#include <linux/printk.h>
+#include <linux/seq_file.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+
+#include "bat_algo.h"
+
+char batadv_routing_algo[20] = "BATMAN_IV";
+static struct hlist_head batadv_algo_list;
+
+/**
+ * batadv_algo_init - Initialize batman-adv algorithm management data structures
+ */
+void batadv_algo_init(void)
+{
+ INIT_HLIST_HEAD(&batadv_algo_list);
+}
+
+static struct batadv_algo_ops *batadv_algo_get(char *name)
+{
+ struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp;
+
+ hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) {
+ if (strcmp(bat_algo_ops_tmp->name, name) != 0)
+ continue;
+
+ bat_algo_ops = bat_algo_ops_tmp;
+ break;
+ }
+
+ return bat_algo_ops;
+}
+
+int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
+{
+ struct batadv_algo_ops *bat_algo_ops_tmp;
+
+ bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name);
+ if (bat_algo_ops_tmp) {
+ pr_info("Trying to register already registered routing algorithm: %s\n",
+ bat_algo_ops->name);
+ return -EEXIST;
+ }
+
+ /* all algorithms must implement all ops (for now) */
+ if (!bat_algo_ops->iface.enable ||
+ !bat_algo_ops->iface.disable ||
+ !bat_algo_ops->iface.update_mac ||
+ !bat_algo_ops->iface.primary_set ||
+ !bat_algo_ops->neigh.cmp ||
+ !bat_algo_ops->neigh.is_similar_or_better) {
+ pr_info("Routing algo '%s' does not implement required ops\n",
+ bat_algo_ops->name);
+ return -EINVAL;
+ }
+
+ INIT_HLIST_NODE(&bat_algo_ops->list);
+ hlist_add_head(&bat_algo_ops->list, &batadv_algo_list);
+
+ return 0;
+}
+
+int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
+{
+ struct batadv_algo_ops *bat_algo_ops;
+
+ bat_algo_ops = batadv_algo_get(name);
+ if (!bat_algo_ops)
+ return -EINVAL;
+
+ bat_priv->algo_ops = bat_algo_ops;
+
+ return 0;
+}
+
+int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct batadv_algo_ops *bat_algo_ops;
+
+ seq_puts(seq, "Available routing algorithms:\n");
+
+ hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
+ seq_printf(seq, " * %s\n", bat_algo_ops->name);
+ }
+
+ return 0;
+}
+
+static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
+{
+ struct batadv_algo_ops *bat_algo_ops;
+ char *algo_name = (char *)val;
+ size_t name_len = strlen(algo_name);
+
+ if (name_len > 0 && algo_name[name_len - 1] == '\n')
+ algo_name[name_len - 1] = '\0';
+
+ bat_algo_ops = batadv_algo_get(algo_name);
+ if (!bat_algo_ops) {
+ pr_err("Routing algorithm '%s' is not supported\n", algo_name);
+ return -EINVAL;
+ }
+
+ return param_set_copystring(algo_name, kp);
+}
+
+static const struct kernel_param_ops batadv_param_ops_ra = {
+ .set = batadv_param_set_ra,
+ .get = param_get_string,
+};
+
+static struct kparam_string batadv_param_string_ra = {
+ .maxlen = sizeof(batadv_routing_algo),
+ .string = batadv_routing_algo,
+};
+
+module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra,
+ 0644);
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 03dafd33d..860d773dd 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -18,32 +18,18 @@
#ifndef _NET_BATMAN_ADV_BAT_ALGO_H_
#define _NET_BATMAN_ADV_BAT_ALGO_H_
-struct batadv_priv;
+#include "main.h"
-int batadv_iv_init(void);
+#include <linux/types.h>
-#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+struct seq_file;
-int batadv_v_init(void);
-int batadv_v_mesh_init(struct batadv_priv *bat_priv);
-void batadv_v_mesh_free(struct batadv_priv *bat_priv);
+extern char batadv_routing_algo[];
+extern struct list_head batadv_hardif_list;
-#else
-
-static inline int batadv_v_init(void)
-{
- return 0;
-}
-
-static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv)
-{
- return 0;
-}
-
-static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv)
-{
-}
-
-#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+void batadv_algo_init(void);
+int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops);
+int batadv_algo_select(struct batadv_priv *bat_priv, char *name);
+int batadv_algo_seq_print_text(struct seq_file *seq, void *offset);
#endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index ce2f20304..19b0abd6c 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -15,7 +15,7 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "bat_algo.h"
+#include "bat_iv_ogm.h"
#include "main.h"
#include <linux/atomic.h>
@@ -30,8 +30,9 @@
#include <linux/if_ether.h>
#include <linux/init.h>
#include <linux/jiffies.h>
-#include <linux/list.h>
+#include <linux/kernel.h>
#include <linux/kref.h>
+#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
@@ -48,15 +49,20 @@
#include <linux/types.h>
#include <linux/workqueue.h>
+#include "bat_algo.h"
#include "bitarray.h"
#include "hard-interface.h"
#include "hash.h"
+#include "log.h"
#include "network-coding.h"
#include "originator.h"
#include "packet.h"
#include "routing.h"
#include "send.h"
#include "translation-table.h"
+#include "tvlv.h"
+
+static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work);
/**
* enum batadv_dup_status - duplicate status
@@ -336,7 +342,8 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface,
{
struct batadv_neigh_node *neigh_node;
- neigh_node = batadv_neigh_node_new(orig_node, hard_iface, neigh_addr);
+ neigh_node = batadv_neigh_node_get_or_create(orig_node,
+ hard_iface, neigh_addr);
if (!neigh_node)
goto out;
@@ -730,7 +737,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
/* start timer for this packet */
INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work,
- batadv_send_outstanding_bat_ogm_packet);
+ batadv_iv_send_outstanding_bat_ogm_packet);
queue_delayed_work(batadv_event_workqueue,
&forw_packet_aggr->delayed_work,
send_time - jiffies);
@@ -937,6 +944,19 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
u16 tvlv_len = 0;
unsigned long send_time;
+ if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) ||
+ (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED))
+ return;
+
+ /* the interface gets activated here to avoid race conditions between
+ * the moment of activating the interface in
+ * hardif_activate_interface() where the originator mac is set and
+ * outdated packets (especially uninitialized mac addresses) in the
+ * packet queue
+ */
+ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED)
+ hard_iface->if_status = BATADV_IF_ACTIVE;
+
primary_if = batadv_primary_if_get_selected(bat_priv);
if (hard_iface == primary_if) {
@@ -1778,6 +1798,45 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
batadv_orig_node_put(orig_node);
}
+static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_forw_packet *forw_packet;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = to_delayed_work(work);
+ forw_packet = container_of(delayed_work, struct batadv_forw_packet,
+ delayed_work);
+ bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface);
+ spin_lock_bh(&bat_priv->forw_bat_list_lock);
+ hlist_del(&forw_packet->list);
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ goto out;
+
+ batadv_iv_ogm_emit(forw_packet);
+
+ /* we have to have at least one packet in the queue to determine the
+ * queues wake up time unless we are shutting down.
+ *
+ * only re-schedule if this is the "original" copy, e.g. the OGM of the
+ * primary interface should only be rescheduled once per period, but
+ * this function will be called for the forw_packet instances of the
+ * other secondary interfaces as well.
+ */
+ if (forw_packet->own &&
+ forw_packet->if_incoming == forw_packet->if_outgoing)
+ batadv_iv_ogm_schedule(forw_packet->if_incoming);
+
+out:
+ /* don't count own packet */
+ if (!forw_packet->own)
+ atomic_inc(&bat_priv->batman_queue_left);
+
+ batadv_forw_packet_free(forw_packet);
+}
+
static int batadv_iv_ogm_receive(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming)
{
@@ -1794,7 +1853,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
/* did we receive a B.A.T.M.A.N. IV OGM packet on an interface
* that does not have B.A.T.M.A.N. IV enabled ?
*/
- if (bat_priv->bat_algo_ops->bat_ogm_emit != batadv_iv_ogm_emit)
+ if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable)
return NET_RX_DROP;
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
@@ -2052,21 +2111,32 @@ out:
return ret;
}
+static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
+{
+ /* begin scheduling originator messages on that interface */
+ batadv_iv_ogm_schedule(hard_iface);
+}
+
static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
.name = "BATMAN_IV",
- .bat_iface_enable = batadv_iv_ogm_iface_enable,
- .bat_iface_disable = batadv_iv_ogm_iface_disable,
- .bat_iface_update_mac = batadv_iv_ogm_iface_update_mac,
- .bat_primary_iface_set = batadv_iv_ogm_primary_iface_set,
- .bat_ogm_schedule = batadv_iv_ogm_schedule,
- .bat_ogm_emit = batadv_iv_ogm_emit,
- .bat_neigh_cmp = batadv_iv_ogm_neigh_cmp,
- .bat_neigh_is_similar_or_better = batadv_iv_ogm_neigh_is_sob,
- .bat_neigh_print = batadv_iv_neigh_print,
- .bat_orig_print = batadv_iv_ogm_orig_print,
- .bat_orig_free = batadv_iv_ogm_orig_free,
- .bat_orig_add_if = batadv_iv_ogm_orig_add_if,
- .bat_orig_del_if = batadv_iv_ogm_orig_del_if,
+ .iface = {
+ .activate = batadv_iv_iface_activate,
+ .enable = batadv_iv_ogm_iface_enable,
+ .disable = batadv_iv_ogm_iface_disable,
+ .update_mac = batadv_iv_ogm_iface_update_mac,
+ .primary_set = batadv_iv_ogm_primary_iface_set,
+ },
+ .neigh = {
+ .cmp = batadv_iv_ogm_neigh_cmp,
+ .is_similar_or_better = batadv_iv_ogm_neigh_is_sob,
+ .print = batadv_iv_neigh_print,
+ },
+ .orig = {
+ .print = batadv_iv_ogm_orig_print,
+ .free = batadv_iv_ogm_orig_free,
+ .add_if = batadv_iv_ogm_orig_add_if,
+ .del_if = batadv_iv_ogm_orig_del_if,
+ },
};
int __init batadv_iv_init(void)
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
new file mode 100644
index 000000000..b9f3550fa
--- /dev/null
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -0,0 +1,25 @@
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_
+#define _BATMAN_ADV_BATADV_IV_OGM_H_
+
+#include "main.h"
+
+int batadv_iv_init(void);
+
+#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 0a12e5cdd..0366cbf5e 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -15,7 +15,7 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "bat_algo.h"
+#include "bat_v.h"
#include "main.h"
#include <linux/atomic.h>
@@ -31,6 +31,7 @@
#include <linux/types.h>
#include <linux/workqueue.h>
+#include "bat_algo.h"
#include "bat_v_elp.h"
#include "bat_v_ogm.h"
#include "hard-interface.h"
@@ -70,11 +71,6 @@ static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
if (ret < 0)
batadv_v_elp_iface_disable(hard_iface);
- /* enable link throughput auto-detection by setting the throughput
- * override to zero
- */
- atomic_set(&hard_iface->bat_v.throughput_override, 0);
-
return ret;
}
@@ -119,14 +115,6 @@ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
batadv_v_elp_throughput_metric_update);
}
-static void batadv_v_ogm_schedule(struct batadv_hard_iface *hard_iface)
-{
-}
-
-static void batadv_v_ogm_emit(struct batadv_forw_packet *forw_packet)
-{
-}
-
/**
* batadv_v_orig_print_neigh - print neighbors for the originator table
* @orig_node: the orig_node for which the neighbors are printed
@@ -334,21 +322,39 @@ err_ifinfo1:
static struct batadv_algo_ops batadv_batman_v __read_mostly = {
.name = "BATMAN_V",
- .bat_iface_activate = batadv_v_iface_activate,
- .bat_iface_enable = batadv_v_iface_enable,
- .bat_iface_disable = batadv_v_iface_disable,
- .bat_iface_update_mac = batadv_v_iface_update_mac,
- .bat_primary_iface_set = batadv_v_primary_iface_set,
- .bat_hardif_neigh_init = batadv_v_hardif_neigh_init,
- .bat_ogm_emit = batadv_v_ogm_emit,
- .bat_ogm_schedule = batadv_v_ogm_schedule,
- .bat_orig_print = batadv_v_orig_print,
- .bat_neigh_cmp = batadv_v_neigh_cmp,
- .bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob,
- .bat_neigh_print = batadv_v_neigh_print,
+ .iface = {
+ .activate = batadv_v_iface_activate,
+ .enable = batadv_v_iface_enable,
+ .disable = batadv_v_iface_disable,
+ .update_mac = batadv_v_iface_update_mac,
+ .primary_set = batadv_v_primary_iface_set,
+ },
+ .neigh = {
+ .hardif_init = batadv_v_hardif_neigh_init,
+ .cmp = batadv_v_neigh_cmp,
+ .is_similar_or_better = batadv_v_neigh_is_sob,
+ .print = batadv_v_neigh_print,
+ },
+ .orig = {
+ .print = batadv_v_orig_print,
+ },
};
/**
+ * batadv_v_hardif_init - initialize the algorithm specific fields in the
+ * hard-interface object
+ * @hard_iface: the hard-interface to initialize
+ */
+void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface)
+{
+ /* enable link throughput auto-detection by setting the throughput
+ * override to zero
+ */
+ atomic_set(&hard_iface->bat_v.throughput_override, 0);
+ atomic_set(&hard_iface->bat_v.elp_interval, 500);
+}
+
+/**
* batadv_v_mesh_init - initialize the B.A.T.M.A.N. V private resources for a
* mesh
* @bat_priv: the object representing the mesh interface to initialise
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
new file mode 100644
index 000000000..83b776397
--- /dev/null
+++ b/net/batman-adv/bat_v.h
@@ -0,0 +1,52 @@
+/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Linus LĂ¼ssing
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_BAT_V_H_
+#define _NET_BATMAN_ADV_BAT_V_H_
+
+#include "main.h"
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+
+int batadv_v_init(void);
+void batadv_v_hardif_init(struct batadv_hard_iface *hardif);
+int batadv_v_mesh_init(struct batadv_priv *bat_priv);
+void batadv_v_mesh_free(struct batadv_priv *bat_priv);
+
+#else
+
+static inline int batadv_v_init(void)
+{
+ return 0;
+}
+
+static inline void batadv_v_hardif_init(struct batadv_hard_iface *hardif)
+{
+}
+
+static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv)
+{
+}
+
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
+#endif /* _NET_BATMAN_ADV_BAT_V_H_ */
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index df42eb136..ee08540ce 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -43,6 +43,7 @@
#include "bat_algo.h"
#include "bat_v_ogm.h"
#include "hard-interface.h"
+#include "log.h"
#include "originator.h"
#include "packet.h"
#include "routing.h"
@@ -334,7 +335,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
goto out;
skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN);
- elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN);
+ elp_buff = skb_put(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN);
elp_packet = (struct batadv_elp_packet *)elp_buff;
memset(elp_packet, 0, BATADV_ELP_HLEN);
@@ -344,7 +345,6 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
/* randomize initial seqno to avoid collision */
get_random_bytes(&random_seqno, sizeof(random_seqno));
atomic_set(&hard_iface->bat_v.elp_seqno, random_seqno);
- atomic_set(&hard_iface->bat_v.elp_interval, 500);
/* assume full-duplex by default */
hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
@@ -443,7 +443,8 @@ static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv,
if (!orig_neigh)
return;
- neigh = batadv_neigh_node_new(orig_neigh, if_incoming, neigh_addr);
+ neigh = batadv_neigh_node_get_or_create(orig_neigh,
+ if_incoming, neigh_addr);
if (!neigh)
goto orig_free;
@@ -503,7 +504,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
/* did we receive a B.A.T.M.A.N. V ELP packet on an interface
* that does not have B.A.T.M.A.N. V ELP enabled ?
*/
- if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0)
+ if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
return NET_RX_DROP;
elp_packet = (struct batadv_elp_packet *)skb->data;
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index cc130b2d0..be17c0b13 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -15,11 +15,11 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
-
#ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_
#define _NET_BATMAN_ADV_BAT_V_ELP_H_
+#include "main.h"
+
struct sk_buff;
struct work_struct;
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 473ebb9a0..6fbba4eb0 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -39,13 +39,16 @@
#include <linux/types.h>
#include <linux/workqueue.h>
+#include "bat_algo.h"
#include "hard-interface.h"
#include "hash.h"
+#include "log.h"
#include "originator.h"
#include "packet.h"
#include "routing.h"
#include "send.h"
#include "translation-table.h"
+#include "tvlv.h"
/**
* batadv_v_ogm_orig_get - retrieve and possibly create an originator node
@@ -683,8 +686,8 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
if (!orig_node)
return;
- neigh_node = batadv_neigh_node_new(orig_node, if_incoming,
- ethhdr->h_source);
+ neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming,
+ ethhdr->h_source);
if (!neigh_node)
goto out;
@@ -751,7 +754,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
/* did we receive a OGM2 packet on an interface that does not have
* B.A.T.M.A.N. V enabled ?
*/
- if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0)
+ if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
return NET_RX_DROP;
if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index d849c75ad..4c4d45caa 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -18,10 +18,10 @@
#ifndef _BATMAN_ADV_BATADV_V_OGM_H_
#define _BATMAN_ADV_BATADV_V_OGM_H_
+#include "main.h"
+
#include <linux/types.h>
-struct batadv_hard_iface;
-struct batadv_priv;
struct sk_buff;
int batadv_v_ogm_init(struct batadv_priv *bat_priv);
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index a0c791383..032271421 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -20,6 +20,8 @@
#include <linux/bitmap.h>
+#include "log.h"
+
/* shift the packet array by n places. */
static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n)
{
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 825a5cdf4..ad2ffe16d 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -48,6 +48,7 @@
#include "hard-interface.h"
#include "hash.h"
+#include "log.h"
#include "originator.h"
#include "packet.h"
#include "sysfs.h"
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 952900466..1d68b6e63 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -18,245 +18,33 @@
#include "debugfs.h"
#include "main.h"
-#include <linux/compiler.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/export.h>
-#include <linux/fcntl.h>
#include <linux/fs.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/netdevice.h>
-#include <linux/poll.h>
#include <linux/printk.h>
#include <linux/sched.h> /* for linux/wait.h */
#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/stddef.h>
#include <linux/stringify.h>
#include <linux/sysfs.h>
-#include <linux/types.h>
-#include <linux/uaccess.h>
-#include <linux/wait.h>
-#include <stdarg.h>
+#include "bat_algo.h"
#include "bridge_loop_avoidance.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
#include "icmp_socket.h"
+#include "log.h"
+#include "multicast.h"
#include "network-coding.h"
#include "originator.h"
#include "translation-table.h"
static struct dentry *batadv_debugfs;
-#ifdef CONFIG_BATMAN_ADV_DEBUG
-#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1)
-
-static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN;
-
-static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log,
- size_t idx)
-{
- return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK];
-}
-
-static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log,
- char c)
-{
- char *char_addr;
-
- char_addr = batadv_log_char_addr(debug_log, debug_log->log_end);
- *char_addr = c;
- debug_log->log_end++;
-
- if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len)
- debug_log->log_start = debug_log->log_end - batadv_log_buff_len;
-}
-
-__printf(2, 3)
-static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log,
- const char *fmt, ...)
-{
- va_list args;
- static char debug_log_buf[256];
- char *p;
-
- if (!debug_log)
- return 0;
-
- spin_lock_bh(&debug_log->lock);
- va_start(args, fmt);
- vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args);
- va_end(args);
-
- for (p = debug_log_buf; *p != 0; p++)
- batadv_emit_log_char(debug_log, *p);
-
- spin_unlock_bh(&debug_log->lock);
-
- wake_up(&debug_log->queue_wait);
-
- return 0;
-}
-
-int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
-{
- va_list args;
- char tmp_log_buf[256];
-
- va_start(args, fmt);
- vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args);
- batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s",
- jiffies_to_msecs(jiffies), tmp_log_buf);
- va_end(args);
-
- return 0;
-}
-
-static int batadv_log_open(struct inode *inode, struct file *file)
-{
- if (!try_module_get(THIS_MODULE))
- return -EBUSY;
-
- nonseekable_open(inode, file);
- file->private_data = inode->i_private;
- return 0;
-}
-
-static int batadv_log_release(struct inode *inode, struct file *file)
-{
- module_put(THIS_MODULE);
- return 0;
-}
-
-static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log)
-{
- return !(debug_log->log_start - debug_log->log_end);
-}
-
-static ssize_t batadv_log_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct batadv_priv *bat_priv = file->private_data;
- struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
- int error, i = 0;
- char *char_addr;
- char c;
-
- if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log))
- return -EAGAIN;
-
- if (!buf)
- return -EINVAL;
-
- if (count == 0)
- return 0;
-
- if (!access_ok(VERIFY_WRITE, buf, count))
- return -EFAULT;
-
- error = wait_event_interruptible(debug_log->queue_wait,
- (!batadv_log_empty(debug_log)));
-
- if (error)
- return error;
-
- spin_lock_bh(&debug_log->lock);
-
- while ((!error) && (i < count) &&
- (debug_log->log_start != debug_log->log_end)) {
- char_addr = batadv_log_char_addr(debug_log,
- debug_log->log_start);
- c = *char_addr;
-
- debug_log->log_start++;
-
- spin_unlock_bh(&debug_log->lock);
-
- error = __put_user(c, buf);
-
- spin_lock_bh(&debug_log->lock);
-
- buf++;
- i++;
- }
-
- spin_unlock_bh(&debug_log->lock);
-
- if (!error)
- return i;
-
- return error;
-}
-
-static unsigned int batadv_log_poll(struct file *file, poll_table *wait)
-{
- struct batadv_priv *bat_priv = file->private_data;
- struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
-
- poll_wait(file, &debug_log->queue_wait, wait);
-
- if (!batadv_log_empty(debug_log))
- return POLLIN | POLLRDNORM;
-
- return 0;
-}
-
-static const struct file_operations batadv_log_fops = {
- .open = batadv_log_open,
- .release = batadv_log_release,
- .read = batadv_log_read,
- .poll = batadv_log_poll,
- .llseek = no_llseek,
-};
-
-static int batadv_debug_log_setup(struct batadv_priv *bat_priv)
-{
- struct dentry *d;
-
- if (!bat_priv->debug_dir)
- goto err;
-
- bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC);
- if (!bat_priv->debug_log)
- goto err;
-
- spin_lock_init(&bat_priv->debug_log->lock);
- init_waitqueue_head(&bat_priv->debug_log->queue_wait);
-
- d = debugfs_create_file("log", S_IFREG | S_IRUSR,
- bat_priv->debug_dir, bat_priv,
- &batadv_log_fops);
- if (!d)
- goto err;
-
- return 0;
-
-err:
- return -ENOMEM;
-}
-
-static void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
-{
- kfree(bat_priv->debug_log);
- bat_priv->debug_log = NULL;
-}
-#else /* CONFIG_BATMAN_ADV_DEBUG */
-static int batadv_debug_log_setup(struct batadv_priv *bat_priv)
-{
- return 0;
-}
-
-static void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
-{
-}
-#endif
-
static int batadv_algorithms_open(struct inode *inode, struct file *file)
{
return single_open(file, batadv_algo_seq_print_text, NULL);
@@ -363,6 +151,22 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file)
}
#endif
+#ifdef CONFIG_BATMAN_ADV_MCAST
+/**
+ * batadv_mcast_flags_open - prepare file handler for reads from mcast_flags
+ * @inode: inode which was opened
+ * @file: file handle to be initialized
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_mcast_flags_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_mcast_flags_seq_print_text, net_dev);
+}
+#endif
+
#define BATADV_DEBUGINFO(_name, _mode, _open) \
struct batadv_debuginfo batadv_debuginfo_##_name = { \
.attr = { \
@@ -407,6 +211,9 @@ static BATADV_DEBUGINFO(transtable_local, S_IRUGO,
#ifdef CONFIG_BATMAN_ADV_NC
static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open);
#endif
+#ifdef CONFIG_BATMAN_ADV_MCAST
+static BATADV_DEBUGINFO(mcast_flags, S_IRUGO, batadv_mcast_flags_open);
+#endif
static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
&batadv_debuginfo_neighbors,
@@ -424,6 +231,9 @@ static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
#ifdef CONFIG_BATMAN_ADV_NC
&batadv_debuginfo_nc_nodes,
#endif
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ &batadv_debuginfo_mcast_flags,
+#endif
NULL,
};
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index aee3b3991..b1cc8bfe1 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -45,9 +45,11 @@
#include "hard-interface.h"
#include "hash.h"
+#include "log.h"
#include "originator.h"
#include "send.h"
#include "translation-table.h"
+#include "tvlv.h"
static void batadv_dat_purge(struct work_struct *work);
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 65536db1b..0934730fb 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -27,7 +27,6 @@
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
-#include <linux/pkt_sched.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -414,7 +413,7 @@ static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
if (!skb_fragment)
goto err;
- skb->priority = TC_PRIO_CONTROL;
+ skb_fragment->priority = skb->priority;
/* Eat the last mtu-bytes of the skb */
skb_reserve(skb_fragment, header_size + ETH_HLEN);
@@ -434,11 +433,12 @@ err:
* @orig_node: final destination of the created fragments
* @neigh_node: next-hop of the created fragments
*
- * Return: true on success, false otherwise.
+ * Return: the netdev tx status or -1 in case of error.
+ * When -1 is returned the skb is not consumed.
*/
-bool batadv_frag_send_packet(struct sk_buff *skb,
- struct batadv_orig_node *orig_node,
- struct batadv_neigh_node *neigh_node)
+int batadv_frag_send_packet(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node)
{
struct batadv_priv *bat_priv;
struct batadv_hard_iface *primary_if = NULL;
@@ -447,7 +447,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
unsigned int mtu = neigh_node->if_incoming->net_dev->mtu;
unsigned int header_size = sizeof(frag_header);
unsigned int max_fragment_size, max_packet_size;
- bool ret = false;
+ int ret = -1;
/* To avoid merge and refragmentation at next-hops we never send
* fragments larger than BATADV_FRAG_MAX_FRAG_SIZE
@@ -458,12 +458,12 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
/* Don't even try to fragment, if we need more than 16 fragments */
if (skb->len > max_packet_size)
- goto out_err;
+ goto out;
bat_priv = orig_node->bat_priv;
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
- goto out_err;
+ goto out;
/* Create one header to be copied to all fragments */
frag_header.packet_type = BATADV_UNICAST_FRAG;
@@ -473,6 +473,15 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
frag_header.reserved = 0;
frag_header.no = 0;
frag_header.total_size = htons(skb->len);
+
+ /* skb->priority values from 256->263 are magic values to
+ * directly indicate a specific 802.1d priority. This is used
+ * to allow 802.1d priority to be passed directly in from VLAN
+ * tags, etc.
+ */
+ if (skb->priority >= 256 && skb->priority <= 263)
+ frag_header.priority = skb->priority - 256;
+
ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr);
ether_addr_copy(frag_header.dest, orig_node->orig);
@@ -480,23 +489,33 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
while (skb->len > max_fragment_size) {
skb_fragment = batadv_frag_create(skb, &frag_header, mtu);
if (!skb_fragment)
- goto out_err;
+ goto out;
batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
skb_fragment->len + ETH_HLEN);
- batadv_send_unicast_skb(skb_fragment, neigh_node);
+ ret = batadv_send_unicast_skb(skb_fragment, neigh_node);
+ if (ret != NET_XMIT_SUCCESS) {
+ /* return -1 so that the caller can free the original
+ * skb
+ */
+ ret = -1;
+ goto out;
+ }
+
frag_header.no++;
/* The initial check in this function should cover this case */
- if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1)
- goto out_err;
+ if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) {
+ ret = -1;
+ goto out;
+ }
}
/* Make room for the fragment header. */
if (batadv_skb_head_push(skb, header_size) < 0 ||
pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0)
- goto out_err;
+ goto out;
memcpy(skb->data, &frag_header, header_size);
@@ -504,11 +523,9 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
skb->len + ETH_HLEN);
- batadv_send_unicast_skb(skb, neigh_node);
-
- ret = true;
+ ret = batadv_send_unicast_skb(skb, neigh_node);
-out_err:
+out:
if (primary_if)
batadv_hardif_put(primary_if);
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 9ff77c7ef..3202fe329 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -34,9 +34,9 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
struct batadv_orig_node *orig_node_src);
bool batadv_frag_skb_buffer(struct sk_buff **skb,
struct batadv_orig_node *orig_node);
-bool batadv_frag_send_packet(struct sk_buff *skb,
- struct batadv_orig_node *orig_node,
- struct batadv_neigh_node *neigh_node);
+int batadv_frag_send_packet(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node);
/**
* batadv_frag_check_entry - check if a list of fragments has timed out
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 5839c569f..63a805d3f 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -42,6 +42,7 @@
#include "gateway_common.h"
#include "hard-interface.h"
+#include "log.h"
#include "originator.h"
#include "packet.h"
#include "routing.h"
@@ -192,7 +193,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
tq_avg = router_ifinfo->bat_iv.tq_avg;
- switch (atomic_read(&bat_priv->gw_sel_class)) {
+ switch (atomic_read(&bat_priv->gw.sel_class)) {
case 1: /* fast connection */
tmp_gw_factor = tq_avg * tq_avg;
tmp_gw_factor *= gw_node->bandwidth_down;
@@ -255,7 +256,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv)
{
struct batadv_gw_node *curr_gw;
- if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT)
+ if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT)
return;
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
@@ -283,7 +284,7 @@ void batadv_gw_election(struct batadv_priv *bat_priv)
struct batadv_neigh_ifinfo *router_ifinfo = NULL;
char gw_addr[18] = { '\0' };
- if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT)
+ if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT)
goto out;
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
@@ -402,8 +403,8 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv,
/* if the routing class is greater than 3 the value tells us how much
* greater the TQ value of the new gateway must be
*/
- if ((atomic_read(&bat_priv->gw_sel_class) > 3) &&
- (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw_sel_class)))
+ if ((atomic_read(&bat_priv->gw.sel_class) > 3) &&
+ (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class)))
goto out;
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -638,8 +639,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
goto out;
seq_printf(seq,
- " %-12s (%s/%i) %17s [%10s]: advertised uplink bandwidth ... [B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s)]\n",
- "Gateway", "#", BATADV_TQ_MAX_VALUE, "Nexthop", "outgoingIF",
+ " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth ... [B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s)]\n",
BATADV_SOURCE_VERSION, primary_if->net_dev->name,
primary_if->net_dev->dev_addr, net_dev->name);
@@ -821,7 +821,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
if (!gw_node)
goto out;
- switch (atomic_read(&bat_priv->gw_mode)) {
+ switch (atomic_read(&bat_priv->gw.mode)) {
case BATADV_GW_MODE_SERVER:
/* If we are a GW then we are our best GW. We can artificially
* set the tq towards ourself as the maximum value
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 442304788..d7bc6a87b 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -19,8 +19,8 @@
#include "main.h"
#include <linux/atomic.h>
-#include <linux/errno.h>
#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/netdevice.h>
@@ -28,7 +28,9 @@
#include <linux/string.h>
#include "gateway_client.h"
+#include "log.h"
#include "packet.h"
+#include "tvlv.h"
/**
* batadv_parse_throughput - parse supplied string buffer to extract throughput
@@ -144,7 +146,7 @@ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv)
u32 down, up;
char gw_mode;
- gw_mode = atomic_read(&bat_priv->gw_mode);
+ gw_mode = atomic_read(&bat_priv->gw.mode);
switch (gw_mode) {
case BATADV_GW_MODE_OFF:
@@ -241,8 +243,8 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
/* restart gateway selection if fast or late switching was enabled */
if ((gateway.bandwidth_down != 0) &&
- (atomic_read(&bat_priv->gw_mode) == BATADV_GW_MODE_CLIENT) &&
- (atomic_read(&bat_priv->gw_sel_class) > 2))
+ (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) &&
+ (atomic_read(&bat_priv->gw.sel_class) > 2))
batadv_gw_check_election(bat_priv, orig);
}
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 8c2f39962..1f9080840 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -23,9 +23,9 @@
#include <linux/byteorder/generic.h>
#include <linux/errno.h>
#include <linux/fs.h>
+#include <linux/if.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
-#include <linux/if.h>
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
@@ -37,10 +37,12 @@
#include <linux/spinlock.h>
#include <linux/workqueue.h>
+#include "bat_v.h"
#include "bridge_loop_avoidance.h"
#include "debugfs.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
+#include "log.h"
#include "originator.h"
#include "packet.h"
#include "send.h"
@@ -245,7 +247,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv,
if (!new_hard_iface)
goto out;
- bat_priv->bat_algo_ops->bat_primary_iface_set(new_hard_iface);
+ bat_priv->algo_ops->iface.primary_set(new_hard_iface);
batadv_primary_if_update_addr(bat_priv, curr_hard_iface);
out:
@@ -392,7 +394,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface)
bat_priv = netdev_priv(hard_iface->soft_iface);
- bat_priv->bat_algo_ops->bat_iface_update_mac(hard_iface);
+ bat_priv->algo_ops->iface.update_mac(hard_iface);
hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED;
/* the first active interface becomes our primary interface or
@@ -407,8 +409,8 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface)
batadv_update_min_mtu(hard_iface->soft_iface);
- if (bat_priv->bat_algo_ops->bat_iface_activate)
- bat_priv->bat_algo_ops->bat_iface_activate(hard_iface);
+ if (bat_priv->algo_ops->iface.activate)
+ bat_priv->algo_ops->iface.activate(hard_iface);
out:
if (primary_if)
@@ -506,7 +508,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
if (ret)
goto err_dev;
- ret = bat_priv->bat_algo_ops->bat_iface_enable(hard_iface);
+ ret = bat_priv->algo_ops->iface.enable(hard_iface);
if (ret < 0)
goto err_upper;
@@ -515,7 +517,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
hard_iface->if_status = BATADV_IF_INACTIVE;
ret = batadv_orig_hash_add_if(hard_iface, bat_priv->num_ifaces);
if (ret < 0) {
- bat_priv->bat_algo_ops->bat_iface_disable(hard_iface);
+ bat_priv->algo_ops->iface.disable(hard_iface);
bat_priv->num_ifaces--;
hard_iface->if_status = BATADV_IF_NOT_IN_USE;
goto err_upper;
@@ -553,9 +555,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
batadv_hardif_recalc_extra_skbroom(soft_iface);
- /* begin scheduling originator messages on that interface */
- batadv_schedule_bat_ogm(hard_iface);
-
out:
return 0;
@@ -599,7 +598,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
batadv_hardif_put(new_if);
}
- bat_priv->bat_algo_ops->bat_iface_disable(hard_iface);
+ bat_priv->algo_ops->iface.disable(hard_iface);
hard_iface->if_status = BATADV_IF_NOT_IN_USE;
/* delete all references to this hard_iface */
@@ -686,6 +685,8 @@ batadv_hardif_add_interface(struct net_device *net_dev)
if (batadv_is_wifi_netdev(net_dev))
hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
+ batadv_v_hardif_init(hard_iface);
+
/* extra reference for return */
kref_init(&hard_iface->refcount);
kref_get(&hard_iface->refcount);
@@ -782,7 +783,7 @@ static int batadv_hard_if_event(struct notifier_block *this,
batadv_check_known_mac_addr(hard_iface->net_dev);
bat_priv = netdev_priv(hard_iface->soft_iface);
- bat_priv->bat_algo_ops->bat_iface_update_mac(hard_iface);
+ bat_priv->algo_ops->iface.update_mac(hard_iface);
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 777aea10c..378cc1119 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -45,6 +45,7 @@
#include <linux/wait.h>
#include "hard-interface.h"
+#include "log.h"
#include "originator.h"
#include "packet.h"
#include "send.h"
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
new file mode 100644
index 000000000..56dc532f7
--- /dev/null
+++ b/net/batman-adv/log.c
@@ -0,0 +1,231 @@
+/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "log.h"
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/debugfs.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/fcntl.h>
+#include <linux/fs.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/sched.h> /* for linux/wait.h */
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/wait.h>
+#include <stdarg.h>
+
+#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1)
+
+static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN;
+
+static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log,
+ size_t idx)
+{
+ return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK];
+}
+
+static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log,
+ char c)
+{
+ char *char_addr;
+
+ char_addr = batadv_log_char_addr(debug_log, debug_log->log_end);
+ *char_addr = c;
+ debug_log->log_end++;
+
+ if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len)
+ debug_log->log_start = debug_log->log_end - batadv_log_buff_len;
+}
+
+__printf(2, 3)
+static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log,
+ const char *fmt, ...)
+{
+ va_list args;
+ static char debug_log_buf[256];
+ char *p;
+
+ if (!debug_log)
+ return 0;
+
+ spin_lock_bh(&debug_log->lock);
+ va_start(args, fmt);
+ vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args);
+ va_end(args);
+
+ for (p = debug_log_buf; *p != 0; p++)
+ batadv_emit_log_char(debug_log, *p);
+
+ spin_unlock_bh(&debug_log->lock);
+
+ wake_up(&debug_log->queue_wait);
+
+ return 0;
+}
+
+int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
+{
+ va_list args;
+ char tmp_log_buf[256];
+
+ va_start(args, fmt);
+ vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args);
+ batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s",
+ jiffies_to_msecs(jiffies), tmp_log_buf);
+ va_end(args);
+
+ return 0;
+}
+
+static int batadv_log_open(struct inode *inode, struct file *file)
+{
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+
+ nonseekable_open(inode, file);
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+static int batadv_log_release(struct inode *inode, struct file *file)
+{
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log)
+{
+ return !(debug_log->log_start - debug_log->log_end);
+}
+
+static ssize_t batadv_log_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct batadv_priv *bat_priv = file->private_data;
+ struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
+ int error, i = 0;
+ char *char_addr;
+ char c;
+
+ if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log))
+ return -EAGAIN;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (count == 0)
+ return 0;
+
+ if (!access_ok(VERIFY_WRITE, buf, count))
+ return -EFAULT;
+
+ error = wait_event_interruptible(debug_log->queue_wait,
+ (!batadv_log_empty(debug_log)));
+
+ if (error)
+ return error;
+
+ spin_lock_bh(&debug_log->lock);
+
+ while ((!error) && (i < count) &&
+ (debug_log->log_start != debug_log->log_end)) {
+ char_addr = batadv_log_char_addr(debug_log,
+ debug_log->log_start);
+ c = *char_addr;
+
+ debug_log->log_start++;
+
+ spin_unlock_bh(&debug_log->lock);
+
+ error = __put_user(c, buf);
+
+ spin_lock_bh(&debug_log->lock);
+
+ buf++;
+ i++;
+ }
+
+ spin_unlock_bh(&debug_log->lock);
+
+ if (!error)
+ return i;
+
+ return error;
+}
+
+static unsigned int batadv_log_poll(struct file *file, poll_table *wait)
+{
+ struct batadv_priv *bat_priv = file->private_data;
+ struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
+
+ poll_wait(file, &debug_log->queue_wait, wait);
+
+ if (!batadv_log_empty(debug_log))
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static const struct file_operations batadv_log_fops = {
+ .open = batadv_log_open,
+ .release = batadv_log_release,
+ .read = batadv_log_read,
+ .poll = batadv_log_poll,
+ .llseek = no_llseek,
+};
+
+int batadv_debug_log_setup(struct batadv_priv *bat_priv)
+{
+ struct dentry *d;
+
+ if (!bat_priv->debug_dir)
+ goto err;
+
+ bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC);
+ if (!bat_priv->debug_log)
+ goto err;
+
+ spin_lock_init(&bat_priv->debug_log->lock);
+ init_waitqueue_head(&bat_priv->debug_log->queue_wait);
+
+ d = debugfs_create_file("log", S_IFREG | S_IRUSR,
+ bat_priv->debug_dir, bat_priv,
+ &batadv_log_fops);
+ if (!d)
+ goto err;
+
+ return 0;
+
+err:
+ return -ENOMEM;
+}
+
+void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
+{
+ kfree(bat_priv->debug_log);
+ bat_priv->debug_log = NULL;
+}
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
new file mode 100644
index 000000000..e0e1a88c3
--- /dev/null
+++ b/net/batman-adv/log.h
@@ -0,0 +1,111 @@
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_LOG_H_
+#define _NET_BATMAN_ADV_LOG_H_
+
+#include "main.h"
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/printk.h>
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+
+int batadv_debug_log_setup(struct batadv_priv *bat_priv);
+void batadv_debug_log_cleanup(struct batadv_priv *bat_priv);
+
+#else
+
+static inline int batadv_debug_log_setup(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
+{
+}
+
+#endif
+
+/**
+ * enum batadv_dbg_level - available log levels
+ * @BATADV_DBG_BATMAN: OGM and TQ computations related messages
+ * @BATADV_DBG_ROUTES: route added / changed / deleted
+ * @BATADV_DBG_TT: translation table messages
+ * @BATADV_DBG_BLA: bridge loop avoidance messages
+ * @BATADV_DBG_DAT: ARP snooping and DAT related messages
+ * @BATADV_DBG_NC: network coding related messages
+ * @BATADV_DBG_MCAST: multicast related messages
+ * @BATADV_DBG_TP_METER: throughput meter messages
+ * @BATADV_DBG_ALL: the union of all the above log levels
+ */
+enum batadv_dbg_level {
+ BATADV_DBG_BATMAN = BIT(0),
+ BATADV_DBG_ROUTES = BIT(1),
+ BATADV_DBG_TT = BIT(2),
+ BATADV_DBG_BLA = BIT(3),
+ BATADV_DBG_DAT = BIT(4),
+ BATADV_DBG_NC = BIT(5),
+ BATADV_DBG_MCAST = BIT(6),
+ BATADV_DBG_TP_METER = BIT(7),
+ BATADV_DBG_ALL = 127,
+};
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
+__printf(2, 3);
+
+/* possibly ratelimited debug output */
+#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
+ do { \
+ if (atomic_read(&bat_priv->log_level) & type && \
+ (!ratelimited || net_ratelimit())) \
+ batadv_debug_log(bat_priv, fmt, ## arg);\
+ } \
+ while (0)
+#else /* !CONFIG_BATMAN_ADV_DEBUG */
+__printf(4, 5)
+static inline void _batadv_dbg(int type __always_unused,
+ struct batadv_priv *bat_priv __always_unused,
+ int ratelimited __always_unused,
+ const char *fmt __always_unused, ...)
+{
+}
+#endif
+
+#define batadv_dbg(type, bat_priv, arg...) \
+ _batadv_dbg(type, bat_priv, 0, ## arg)
+#define batadv_dbg_ratelimited(type, bat_priv, arg...) \
+ _batadv_dbg(type, bat_priv, 1, ## arg)
+
+#define batadv_info(net_dev, fmt, arg...) \
+ do { \
+ struct net_device *_netdev = (net_dev); \
+ struct batadv_priv *_batpriv = netdev_priv(_netdev); \
+ batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \
+ pr_info("%s: " fmt, _netdev->name, ## arg); \
+ } while (0)
+#define batadv_err(net_dev, fmt, arg...) \
+ do { \
+ struct net_device *_netdev = (net_dev); \
+ struct batadv_priv *_batpriv = netdev_priv(_netdev); \
+ batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \
+ pr_err("%s: " fmt, _netdev->name, ## arg); \
+ } while (0)
+
+#endif /* _NET_BATMAN_ADV_LOG_H_ */
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 5f2974bd1..fe4c5e29f 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -31,16 +31,13 @@
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
-#include <linux/lockdep.h>
#include <linux/module.h>
-#include <linux/moduleparam.h>
#include <linux/netdevice.h>
-#include <linux/pkt_sched.h>
+#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
-#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/string.h>
@@ -49,6 +46,8 @@
#include <net/rtnetlink.h>
#include "bat_algo.h"
+#include "bat_iv_ogm.h"
+#include "bat_v.h"
#include "bridge_loop_avoidance.h"
#include "debugfs.h"
#include "distributed-arp-table.h"
@@ -56,13 +55,16 @@
#include "gateway_common.h"
#include "hard-interface.h"
#include "icmp_socket.h"
+#include "log.h"
#include "multicast.h"
+#include "netlink.h"
#include "network-coding.h"
#include "originator.h"
#include "packet.h"
#include "routing.h"
#include "send.h"
#include "soft-interface.h"
+#include "tp_meter.h"
#include "translation-table.h"
/* List manipulations on hardif_list have to be rtnl_lock()'ed,
@@ -71,8 +73,6 @@
struct list_head batadv_hardif_list;
static int (*batadv_rx_handler[256])(struct sk_buff *,
struct batadv_hard_iface *);
-char batadv_routing_algo[20] = "BATMAN_IV";
-static struct hlist_head batadv_algo_list;
unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
@@ -83,13 +83,14 @@ static void batadv_recv_handler_init(void);
static int __init batadv_init(void)
{
INIT_LIST_HEAD(&batadv_hardif_list);
- INIT_HLIST_HEAD(&batadv_algo_list);
+ batadv_algo_init();
batadv_recv_handler_init();
batadv_v_init();
batadv_iv_init();
batadv_nc_init();
+ batadv_tp_meter_init();
batadv_event_workqueue = create_singlethread_workqueue("bat_events");
@@ -101,6 +102,7 @@ static int __init batadv_init(void)
register_netdevice_notifier(&batadv_hard_if_notifier);
rtnl_link_register(&batadv_link_ops);
+ batadv_netlink_register();
pr_info("B.A.T.M.A.N. advanced %s (compatibility version %i) loaded\n",
BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION);
@@ -111,6 +113,7 @@ static int __init batadv_init(void)
static void __exit batadv_exit(void)
{
batadv_debugfs_destroy();
+ batadv_netlink_unregister();
rtnl_link_unregister(&batadv_link_ops);
unregister_netdevice_notifier(&batadv_hard_if_notifier);
batadv_hardif_remove_interfaces();
@@ -141,6 +144,7 @@ int batadv_mesh_init(struct net_device *soft_iface)
spin_lock_init(&bat_priv->tvlv.container_list_lock);
spin_lock_init(&bat_priv->tvlv.handler_list_lock);
spin_lock_init(&bat_priv->softif_vlan_list_lock);
+ spin_lock_init(&bat_priv->tp_list_lock);
INIT_HLIST_HEAD(&bat_priv->forw_bat_list);
INIT_HLIST_HEAD(&bat_priv->forw_bcast_list);
@@ -159,6 +163,7 @@ int batadv_mesh_init(struct net_device *soft_iface)
INIT_HLIST_HEAD(&bat_priv->tvlv.container_list);
INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list);
INIT_HLIST_HEAD(&bat_priv->softif_vlan_list);
+ INIT_HLIST_HEAD(&bat_priv->tp_list);
ret = batadv_v_mesh_init(bat_priv);
if (ret < 0)
@@ -538,78 +543,6 @@ void batadv_recv_handler_unregister(u8 packet_type)
batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet;
}
-static struct batadv_algo_ops *batadv_algo_get(char *name)
-{
- struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp;
-
- hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) {
- if (strcmp(bat_algo_ops_tmp->name, name) != 0)
- continue;
-
- bat_algo_ops = bat_algo_ops_tmp;
- break;
- }
-
- return bat_algo_ops;
-}
-
-int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
-{
- struct batadv_algo_ops *bat_algo_ops_tmp;
-
- bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name);
- if (bat_algo_ops_tmp) {
- pr_info("Trying to register already registered routing algorithm: %s\n",
- bat_algo_ops->name);
- return -EEXIST;
- }
-
- /* all algorithms must implement all ops (for now) */
- if (!bat_algo_ops->bat_iface_enable ||
- !bat_algo_ops->bat_iface_disable ||
- !bat_algo_ops->bat_iface_update_mac ||
- !bat_algo_ops->bat_primary_iface_set ||
- !bat_algo_ops->bat_ogm_schedule ||
- !bat_algo_ops->bat_ogm_emit ||
- !bat_algo_ops->bat_neigh_cmp ||
- !bat_algo_ops->bat_neigh_is_similar_or_better) {
- pr_info("Routing algo '%s' does not implement required ops\n",
- bat_algo_ops->name);
- return -EINVAL;
- }
-
- INIT_HLIST_NODE(&bat_algo_ops->list);
- hlist_add_head(&bat_algo_ops->list, &batadv_algo_list);
-
- return 0;
-}
-
-int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
-{
- struct batadv_algo_ops *bat_algo_ops;
-
- bat_algo_ops = batadv_algo_get(name);
- if (!bat_algo_ops)
- return -EINVAL;
-
- bat_priv->bat_algo_ops = bat_algo_ops;
-
- return 0;
-}
-
-int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct batadv_algo_ops *bat_algo_ops;
-
- seq_puts(seq, "Available routing algorithms:\n");
-
- hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
- seq_printf(seq, " * %s\n", bat_algo_ops->name);
- }
-
- return 0;
-}
-
/**
* batadv_skb_crc32 - calculate CRC32 of the whole packet and skip bytes in
* the header
@@ -644,594 +577,6 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
}
/**
- * batadv_tvlv_handler_release - release tvlv handler from lists and queue for
- * free after rcu grace period
- * @ref: kref pointer of the tvlv
- */
-static void batadv_tvlv_handler_release(struct kref *ref)
-{
- struct batadv_tvlv_handler *tvlv_handler;
-
- tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount);
- kfree_rcu(tvlv_handler, rcu);
-}
-
-/**
- * batadv_tvlv_handler_put - decrement the tvlv container refcounter and
- * possibly release it
- * @tvlv_handler: the tvlv handler to free
- */
-static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler)
-{
- kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release);
-}
-
-/**
- * batadv_tvlv_handler_get - retrieve tvlv handler from the tvlv handler list
- * based on the provided type and version (both need to match)
- * @bat_priv: the bat priv with all the soft interface information
- * @type: tvlv handler type to look for
- * @version: tvlv handler version to look for
- *
- * Return: tvlv handler if found or NULL otherwise.
- */
-static struct batadv_tvlv_handler *
-batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version)
-{
- struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL;
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(tvlv_handler_tmp,
- &bat_priv->tvlv.handler_list, list) {
- if (tvlv_handler_tmp->type != type)
- continue;
-
- if (tvlv_handler_tmp->version != version)
- continue;
-
- if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount))
- continue;
-
- tvlv_handler = tvlv_handler_tmp;
- break;
- }
- rcu_read_unlock();
-
- return tvlv_handler;
-}
-
-/**
- * batadv_tvlv_container_release - release tvlv from lists and free
- * @ref: kref pointer of the tvlv
- */
-static void batadv_tvlv_container_release(struct kref *ref)
-{
- struct batadv_tvlv_container *tvlv;
-
- tvlv = container_of(ref, struct batadv_tvlv_container, refcount);
- kfree(tvlv);
-}
-
-/**
- * batadv_tvlv_container_put - decrement the tvlv container refcounter and
- * possibly release it
- * @tvlv: the tvlv container to free
- */
-static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv)
-{
- kref_put(&tvlv->refcount, batadv_tvlv_container_release);
-}
-
-/**
- * batadv_tvlv_container_get - retrieve tvlv container from the tvlv container
- * list based on the provided type and version (both need to match)
- * @bat_priv: the bat priv with all the soft interface information
- * @type: tvlv container type to look for
- * @version: tvlv container version to look for
- *
- * Has to be called with the appropriate locks being acquired
- * (tvlv.container_list_lock).
- *
- * Return: tvlv container if found or NULL otherwise.
- */
-static struct batadv_tvlv_container *
-batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
-{
- struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL;
-
- lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
-
- hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) {
- if (tvlv_tmp->tvlv_hdr.type != type)
- continue;
-
- if (tvlv_tmp->tvlv_hdr.version != version)
- continue;
-
- kref_get(&tvlv_tmp->refcount);
- tvlv = tvlv_tmp;
- break;
- }
-
- return tvlv;
-}
-
-/**
- * batadv_tvlv_container_list_size - calculate the size of the tvlv container
- * list entries
- * @bat_priv: the bat priv with all the soft interface information
- *
- * Has to be called with the appropriate locks being acquired
- * (tvlv.container_list_lock).
- *
- * Return: size of all currently registered tvlv containers in bytes.
- */
-static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
-{
- struct batadv_tvlv_container *tvlv;
- u16 tvlv_len = 0;
-
- lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
-
- hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
- tvlv_len += sizeof(struct batadv_tvlv_hdr);
- tvlv_len += ntohs(tvlv->tvlv_hdr.len);
- }
-
- return tvlv_len;
-}
-
-/**
- * batadv_tvlv_container_remove - remove tvlv container from the tvlv container
- * list
- * @bat_priv: the bat priv with all the soft interface information
- * @tvlv: the to be removed tvlv container
- *
- * Has to be called with the appropriate locks being acquired
- * (tvlv.container_list_lock).
- */
-static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv,
- struct batadv_tvlv_container *tvlv)
-{
- lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
-
- if (!tvlv)
- return;
-
- hlist_del(&tvlv->list);
-
- /* first call to decrement the counter, second call to free */
- batadv_tvlv_container_put(tvlv);
- batadv_tvlv_container_put(tvlv);
-}
-
-/**
- * batadv_tvlv_container_unregister - unregister tvlv container based on the
- * provided type and version (both need to match)
- * @bat_priv: the bat priv with all the soft interface information
- * @type: tvlv container type to unregister
- * @version: tvlv container type to unregister
- */
-void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
- u8 type, u8 version)
-{
- struct batadv_tvlv_container *tvlv;
-
- spin_lock_bh(&bat_priv->tvlv.container_list_lock);
- tvlv = batadv_tvlv_container_get(bat_priv, type, version);
- batadv_tvlv_container_remove(bat_priv, tvlv);
- spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
-}
-
-/**
- * batadv_tvlv_container_register - register tvlv type, version and content
- * to be propagated with each (primary interface) OGM
- * @bat_priv: the bat priv with all the soft interface information
- * @type: tvlv container type
- * @version: tvlv container version
- * @tvlv_value: tvlv container content
- * @tvlv_value_len: tvlv container content length
- *
- * If a container of the same type and version was already registered the new
- * content is going to replace the old one.
- */
-void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
- u8 type, u8 version,
- void *tvlv_value, u16 tvlv_value_len)
-{
- struct batadv_tvlv_container *tvlv_old, *tvlv_new;
-
- if (!tvlv_value)
- tvlv_value_len = 0;
-
- tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC);
- if (!tvlv_new)
- return;
-
- tvlv_new->tvlv_hdr.version = version;
- tvlv_new->tvlv_hdr.type = type;
- tvlv_new->tvlv_hdr.len = htons(tvlv_value_len);
-
- memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len));
- INIT_HLIST_NODE(&tvlv_new->list);
- kref_init(&tvlv_new->refcount);
-
- spin_lock_bh(&bat_priv->tvlv.container_list_lock);
- tvlv_old = batadv_tvlv_container_get(bat_priv, type, version);
- batadv_tvlv_container_remove(bat_priv, tvlv_old);
- hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list);
- spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
-}
-
-/**
- * batadv_tvlv_realloc_packet_buff - reallocate packet buffer to accommodate
- * requested packet size
- * @packet_buff: packet buffer
- * @packet_buff_len: packet buffer size
- * @min_packet_len: requested packet minimum size
- * @additional_packet_len: requested additional packet size on top of minimum
- * size
- *
- * Return: true of the packet buffer could be changed to the requested size,
- * false otherwise.
- */
-static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
- int *packet_buff_len,
- int min_packet_len,
- int additional_packet_len)
-{
- unsigned char *new_buff;
-
- new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC);
-
- /* keep old buffer if kmalloc should fail */
- if (!new_buff)
- return false;
-
- memcpy(new_buff, *packet_buff, min_packet_len);
- kfree(*packet_buff);
- *packet_buff = new_buff;
- *packet_buff_len = min_packet_len + additional_packet_len;
-
- return true;
-}
-
-/**
- * batadv_tvlv_container_ogm_append - append tvlv container content to given
- * OGM packet buffer
- * @bat_priv: the bat priv with all the soft interface information
- * @packet_buff: ogm packet buffer
- * @packet_buff_len: ogm packet buffer size including ogm header and tvlv
- * content
- * @packet_min_len: ogm header size to be preserved for the OGM itself
- *
- * The ogm packet might be enlarged or shrunk depending on the current size
- * and the size of the to-be-appended tvlv containers.
- *
- * Return: size of all appended tvlv containers in bytes.
- */
-u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
- unsigned char **packet_buff,
- int *packet_buff_len, int packet_min_len)
-{
- struct batadv_tvlv_container *tvlv;
- struct batadv_tvlv_hdr *tvlv_hdr;
- u16 tvlv_value_len;
- void *tvlv_value;
- bool ret;
-
- spin_lock_bh(&bat_priv->tvlv.container_list_lock);
- tvlv_value_len = batadv_tvlv_container_list_size(bat_priv);
-
- ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len,
- packet_min_len, tvlv_value_len);
-
- if (!ret)
- goto end;
-
- if (!tvlv_value_len)
- goto end;
-
- tvlv_value = (*packet_buff) + packet_min_len;
-
- hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
- tvlv_hdr = tvlv_value;
- tvlv_hdr->type = tvlv->tvlv_hdr.type;
- tvlv_hdr->version = tvlv->tvlv_hdr.version;
- tvlv_hdr->len = tvlv->tvlv_hdr.len;
- tvlv_value = tvlv_hdr + 1;
- memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len));
- tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len);
- }
-
-end:
- spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
- return tvlv_value_len;
-}
-
-/**
- * batadv_tvlv_call_handler - parse the given tvlv buffer to call the
- * appropriate handlers
- * @bat_priv: the bat priv with all the soft interface information
- * @tvlv_handler: tvlv callback function handling the tvlv content
- * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet
- * @orig_node: orig node emitting the ogm packet
- * @src: source mac address of the unicast packet
- * @dst: destination mac address of the unicast packet
- * @tvlv_value: tvlv content
- * @tvlv_value_len: tvlv content length
- *
- * Return: success if handler was not found or the return value of the handler
- * callback.
- */
-static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
- struct batadv_tvlv_handler *tvlv_handler,
- bool ogm_source,
- struct batadv_orig_node *orig_node,
- u8 *src, u8 *dst,
- void *tvlv_value, u16 tvlv_value_len)
-{
- if (!tvlv_handler)
- return NET_RX_SUCCESS;
-
- if (ogm_source) {
- if (!tvlv_handler->ogm_handler)
- return NET_RX_SUCCESS;
-
- if (!orig_node)
- return NET_RX_SUCCESS;
-
- tvlv_handler->ogm_handler(bat_priv, orig_node,
- BATADV_NO_FLAGS,
- tvlv_value, tvlv_value_len);
- tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED;
- } else {
- if (!src)
- return NET_RX_SUCCESS;
-
- if (!dst)
- return NET_RX_SUCCESS;
-
- if (!tvlv_handler->unicast_handler)
- return NET_RX_SUCCESS;
-
- return tvlv_handler->unicast_handler(bat_priv, src,
- dst, tvlv_value,
- tvlv_value_len);
- }
-
- return NET_RX_SUCCESS;
-}
-
-/**
- * batadv_tvlv_containers_process - parse the given tvlv buffer to call the
- * appropriate handlers
- * @bat_priv: the bat priv with all the soft interface information
- * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet
- * @orig_node: orig node emitting the ogm packet
- * @src: source mac address of the unicast packet
- * @dst: destination mac address of the unicast packet
- * @tvlv_value: tvlv content
- * @tvlv_value_len: tvlv content length
- *
- * Return: success when processing an OGM or the return value of all called
- * handler callbacks.
- */
-int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
- bool ogm_source,
- struct batadv_orig_node *orig_node,
- u8 *src, u8 *dst,
- void *tvlv_value, u16 tvlv_value_len)
-{
- struct batadv_tvlv_handler *tvlv_handler;
- struct batadv_tvlv_hdr *tvlv_hdr;
- u16 tvlv_value_cont_len;
- u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND;
- int ret = NET_RX_SUCCESS;
-
- while (tvlv_value_len >= sizeof(*tvlv_hdr)) {
- tvlv_hdr = tvlv_value;
- tvlv_value_cont_len = ntohs(tvlv_hdr->len);
- tvlv_value = tvlv_hdr + 1;
- tvlv_value_len -= sizeof(*tvlv_hdr);
-
- if (tvlv_value_cont_len > tvlv_value_len)
- break;
-
- tvlv_handler = batadv_tvlv_handler_get(bat_priv,
- tvlv_hdr->type,
- tvlv_hdr->version);
-
- ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler,
- ogm_source, orig_node,
- src, dst, tvlv_value,
- tvlv_value_cont_len);
- if (tvlv_handler)
- batadv_tvlv_handler_put(tvlv_handler);
- tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len;
- tvlv_value_len -= tvlv_value_cont_len;
- }
-
- if (!ogm_source)
- return ret;
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(tvlv_handler,
- &bat_priv->tvlv.handler_list, list) {
- if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) &&
- !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED))
- tvlv_handler->ogm_handler(bat_priv, orig_node,
- cifnotfound, NULL, 0);
-
- tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED;
- }
- rcu_read_unlock();
-
- return NET_RX_SUCCESS;
-}
-
-/**
- * batadv_tvlv_ogm_receive - process an incoming ogm and call the appropriate
- * handlers
- * @bat_priv: the bat priv with all the soft interface information
- * @batadv_ogm_packet: ogm packet containing the tvlv containers
- * @orig_node: orig node emitting the ogm packet
- */
-void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
- struct batadv_ogm_packet *batadv_ogm_packet,
- struct batadv_orig_node *orig_node)
-{
- void *tvlv_value;
- u16 tvlv_value_len;
-
- if (!batadv_ogm_packet)
- return;
-
- tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len);
- if (!tvlv_value_len)
- return;
-
- tvlv_value = batadv_ogm_packet + 1;
-
- batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL,
- tvlv_value, tvlv_value_len);
-}
-
-/**
- * batadv_tvlv_handler_register - register tvlv handler based on the provided
- * type and version (both need to match) for ogm tvlv payload and/or unicast
- * payload
- * @bat_priv: the bat priv with all the soft interface information
- * @optr: ogm tvlv handler callback function. This function receives the orig
- * node, flags and the tvlv content as argument to process.
- * @uptr: unicast tvlv handler callback function. This function receives the
- * source & destination of the unicast packet as well as the tvlv content
- * to process.
- * @type: tvlv handler type to be registered
- * @version: tvlv handler version to be registered
- * @flags: flags to enable or disable TVLV API behavior
- */
-void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
- void (*optr)(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig,
- u8 flags,
- void *tvlv_value,
- u16 tvlv_value_len),
- int (*uptr)(struct batadv_priv *bat_priv,
- u8 *src, u8 *dst,
- void *tvlv_value,
- u16 tvlv_value_len),
- u8 type, u8 version, u8 flags)
-{
- struct batadv_tvlv_handler *tvlv_handler;
-
- tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
- if (tvlv_handler) {
- batadv_tvlv_handler_put(tvlv_handler);
- return;
- }
-
- tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC);
- if (!tvlv_handler)
- return;
-
- tvlv_handler->ogm_handler = optr;
- tvlv_handler->unicast_handler = uptr;
- tvlv_handler->type = type;
- tvlv_handler->version = version;
- tvlv_handler->flags = flags;
- kref_init(&tvlv_handler->refcount);
- INIT_HLIST_NODE(&tvlv_handler->list);
-
- spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
- hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list);
- spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
-}
-
-/**
- * batadv_tvlv_handler_unregister - unregister tvlv handler based on the
- * provided type and version (both need to match)
- * @bat_priv: the bat priv with all the soft interface information
- * @type: tvlv handler type to be unregistered
- * @version: tvlv handler version to be unregistered
- */
-void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
- u8 type, u8 version)
-{
- struct batadv_tvlv_handler *tvlv_handler;
-
- tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
- if (!tvlv_handler)
- return;
-
- batadv_tvlv_handler_put(tvlv_handler);
- spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
- hlist_del_rcu(&tvlv_handler->list);
- spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
- batadv_tvlv_handler_put(tvlv_handler);
-}
-
-/**
- * batadv_tvlv_unicast_send - send a unicast packet with tvlv payload to the
- * specified host
- * @bat_priv: the bat priv with all the soft interface information
- * @src: source mac address of the unicast packet
- * @dst: destination mac address of the unicast packet
- * @type: tvlv type
- * @version: tvlv version
- * @tvlv_value: tvlv content
- * @tvlv_value_len: tvlv content length
- */
-void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
- u8 *dst, u8 type, u8 version,
- void *tvlv_value, u16 tvlv_value_len)
-{
- struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
- struct batadv_tvlv_hdr *tvlv_hdr;
- struct batadv_orig_node *orig_node;
- struct sk_buff *skb;
- unsigned char *tvlv_buff;
- unsigned int tvlv_len;
- ssize_t hdr_len = sizeof(*unicast_tvlv_packet);
-
- orig_node = batadv_orig_hash_find(bat_priv, dst);
- if (!orig_node)
- return;
-
- tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len;
-
- skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len);
- if (!skb)
- goto out;
-
- skb->priority = TC_PRIO_CONTROL;
- skb_reserve(skb, ETH_HLEN);
- tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len);
- unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff;
- unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV;
- unicast_tvlv_packet->version = BATADV_COMPAT_VERSION;
- unicast_tvlv_packet->ttl = BATADV_TTL;
- unicast_tvlv_packet->reserved = 0;
- unicast_tvlv_packet->tvlv_len = htons(tvlv_len);
- unicast_tvlv_packet->align = 0;
- ether_addr_copy(unicast_tvlv_packet->src, src);
- ether_addr_copy(unicast_tvlv_packet->dst, dst);
-
- tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1);
- tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff;
- tvlv_hdr->version = version;
- tvlv_hdr->type = type;
- tvlv_hdr->len = htons(tvlv_value_len);
- tvlv_buff += sizeof(*tvlv_hdr);
- memcpy(tvlv_buff, tvlv_value, tvlv_value_len);
-
- if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP)
- kfree_skb(skb);
-out:
- batadv_orig_node_put(orig_node);
-}
-
-/**
* batadv_get_vid - extract the VLAN identifier from skb if any
* @skb: the buffer containing the packet
* @header_len: length of the batman header preceding the ethernet header
@@ -1284,36 +629,6 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid)
return ap_isolation_enabled;
}
-static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
-{
- struct batadv_algo_ops *bat_algo_ops;
- char *algo_name = (char *)val;
- size_t name_len = strlen(algo_name);
-
- if (name_len > 0 && algo_name[name_len - 1] == '\n')
- algo_name[name_len - 1] = '\0';
-
- bat_algo_ops = batadv_algo_get(algo_name);
- if (!bat_algo_ops) {
- pr_err("Routing algorithm '%s' is not supported\n", algo_name);
- return -EINVAL;
- }
-
- return param_set_copystring(algo_name, kp);
-}
-
-static const struct kernel_param_ops batadv_param_ops_ra = {
- .set = batadv_param_set_ra,
- .get = param_get_string,
-};
-
-static struct kparam_string batadv_param_string_ra = {
- .maxlen = sizeof(batadv_routing_algo),
- .string = batadv_routing_algo,
-};
-
-module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra,
- 0644);
module_init(batadv_init);
module_exit(batadv_exit);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 76925266d..06a860845 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -24,7 +24,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2016.2"
+#define BATADV_SOURCE_VERSION "2016.3"
#endif
/* B.A.T.M.A.N. parameters */
@@ -100,6 +100,9 @@
#define BATADV_NUM_BCASTS_WIRELESS 3
#define BATADV_NUM_BCASTS_MAX 3
+/* length of the single packet used by the TP meter */
+#define BATADV_TP_PACKET_LEN ETH_DATA_LEN
+
/* msecs after which an ARP_REQUEST is sent in broadcast as fallback */
#define ARP_REQ_DELAY 250
/* numbers of originator to contact for any PUT/GET DHT operation */
@@ -131,6 +134,11 @@
#define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */
+/**
+ * BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions
+ */
+#define BATADV_TP_MAX_NUM 5
+
enum batadv_mesh_state {
BATADV_MESH_INACTIVE,
BATADV_MESH_ACTIVE,
@@ -175,29 +183,26 @@ enum batadv_uev_type {
/* Kernel headers */
-#include <linux/atomic.h>
#include <linux/bitops.h> /* for packet.h */
#include <linux/compiler.h>
#include <linux/cpumask.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h> /* for packet.h */
-#include <linux/netdevice.h>
-#include <linux/printk.h>
-#include <linux/types.h>
-#include <linux/percpu.h>
-#include <linux/jiffies.h>
#include <linux/if_vlan.h>
+#include <linux/jiffies.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
#include "types.h"
-struct batadv_ogm_packet;
+struct net_device;
+struct packet_type;
struct seq_file;
struct sk_buff;
#define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? \
(int)(vid & VLAN_VID_MASK) : -1)
-extern char batadv_routing_algo[];
extern struct list_head batadv_hardif_list;
extern unsigned char batadv_broadcast_addr[];
@@ -218,74 +223,9 @@ batadv_recv_handler_register(u8 packet_type,
int (*recv_handler)(struct sk_buff *,
struct batadv_hard_iface *));
void batadv_recv_handler_unregister(u8 packet_type);
-int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops);
-int batadv_algo_select(struct batadv_priv *bat_priv, char *name);
-int batadv_algo_seq_print_text(struct seq_file *seq, void *offset);
__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr);
/**
- * enum batadv_dbg_level - available log levels
- * @BATADV_DBG_BATMAN: OGM and TQ computations related messages
- * @BATADV_DBG_ROUTES: route added / changed / deleted
- * @BATADV_DBG_TT: translation table messages
- * @BATADV_DBG_BLA: bridge loop avoidance messages
- * @BATADV_DBG_DAT: ARP snooping and DAT related messages
- * @BATADV_DBG_NC: network coding related messages
- * @BATADV_DBG_ALL: the union of all the above log levels
- */
-enum batadv_dbg_level {
- BATADV_DBG_BATMAN = BIT(0),
- BATADV_DBG_ROUTES = BIT(1),
- BATADV_DBG_TT = BIT(2),
- BATADV_DBG_BLA = BIT(3),
- BATADV_DBG_DAT = BIT(4),
- BATADV_DBG_NC = BIT(5),
- BATADV_DBG_ALL = 63,
-};
-
-#ifdef CONFIG_BATMAN_ADV_DEBUG
-int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
-__printf(2, 3);
-
-/* possibly ratelimited debug output */
-#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
- do { \
- if (atomic_read(&bat_priv->log_level) & type && \
- (!ratelimited || net_ratelimit())) \
- batadv_debug_log(bat_priv, fmt, ## arg);\
- } \
- while (0)
-#else /* !CONFIG_BATMAN_ADV_DEBUG */
-__printf(4, 5)
-static inline void _batadv_dbg(int type __always_unused,
- struct batadv_priv *bat_priv __always_unused,
- int ratelimited __always_unused,
- const char *fmt __always_unused, ...)
-{
-}
-#endif
-
-#define batadv_dbg(type, bat_priv, arg...) \
- _batadv_dbg(type, bat_priv, 0, ## arg)
-#define batadv_dbg_ratelimited(type, bat_priv, arg...) \
- _batadv_dbg(type, bat_priv, 1, ## arg)
-
-#define batadv_info(net_dev, fmt, arg...) \
- do { \
- struct net_device *_netdev = (net_dev); \
- struct batadv_priv *_batpriv = netdev_priv(_netdev); \
- batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \
- pr_info("%s: " fmt, _netdev->name, ## arg); \
- } while (0)
-#define batadv_err(net_dev, fmt, arg...) \
- do { \
- struct net_device *_netdev = (net_dev); \
- struct batadv_priv *_batpriv = netdev_priv(_netdev); \
- batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \
- pr_err("%s: " fmt, _netdev->name, ## arg); \
- } while (0)
-
-/**
* batadv_compare_eth - Compare two not u16 aligned Ethernet addresses
* @data1: Pointer to a six-byte array containing the Ethernet address
* @data2: Pointer other six-byte array containing the Ethernet address
@@ -370,39 +310,6 @@ static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
*/
#define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0]))
-void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
- u8 type, u8 version,
- void *tvlv_value, u16 tvlv_value_len);
-u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
- unsigned char **packet_buff,
- int *packet_buff_len, int packet_min_len);
-void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
- struct batadv_ogm_packet *batadv_ogm_packet,
- struct batadv_orig_node *orig_node);
-void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
- u8 type, u8 version);
-
-void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
- void (*optr)(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig,
- u8 flags,
- void *tvlv_value,
- u16 tvlv_value_len),
- int (*uptr)(struct batadv_priv *bat_priv,
- u8 *src, u8 *dst,
- void *tvlv_value,
- u16 tvlv_value_len),
- u8 type, u8 version, u8 flags);
-void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
- u8 type, u8 version);
-int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
- bool ogm_source,
- struct batadv_orig_node *orig_node,
- u8 *src, u8 *dst,
- void *tvlv_buff, u16 tvlv_buff_len);
-void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
- u8 *dst, u8 type, u8 version,
- void *tvlv_value, u16 tvlv_value_len);
unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len);
bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid);
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index c32f24faf..cc915073a 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -25,17 +25,23 @@
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/fs.h>
+#include <linux/icmpv6.h>
+#include <linux/if_bridge.h>
#include <linux/if_ether.h>
-#include <linux/in6.h>
+#include <linux/igmp.h>
#include <linux/in.h>
+#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
+#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -43,18 +49,57 @@
#include <linux/string.h>
#include <linux/types.h>
#include <net/addrconf.h>
+#include <net/if_inet6.h>
+#include <net/ip.h>
#include <net/ipv6.h>
+#include "hard-interface.h"
+#include "hash.h"
+#include "log.h"
#include "packet.h"
#include "translation-table.h"
+#include "tvlv.h"
+
+/**
+ * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists
+ * @soft_iface: netdev struct of the mesh interface
+ *
+ * If the given soft interface has a bridge on top then the refcount
+ * of the according net device is increased.
+ *
+ * Return: NULL if no such bridge exists. Otherwise the net device of the
+ * bridge.
+ */
+static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface)
+{
+ struct net_device *upper = soft_iface;
+
+ rcu_read_lock();
+ do {
+ upper = netdev_master_upper_dev_get_rcu(upper);
+ } while (upper && !(upper->priv_flags & IFF_EBRIDGE));
+
+ if (upper)
+ dev_hold(upper);
+ rcu_read_unlock();
+
+ return upper;
+}
/**
* batadv_mcast_mla_softif_get - get softif multicast listeners
* @dev: the device to collect multicast addresses from
* @mcast_list: a list to put found addresses into
*
- * Collect multicast addresses of the local multicast listeners
- * on the given soft interface, dev, in the given mcast_list.
+ * Collects multicast addresses of multicast listeners residing
+ * on this kernel on the given soft interface, dev, in
+ * the given mcast_list. In general, multicast listeners provided by
+ * your multicast receiving applications run directly on this node.
+ *
+ * If there is a bridge interface on top of dev, collects from that one
+ * instead. Just like with IP addresses and routes, multicast listeners
+ * will(/should) register to the bridge interface instead of an
+ * enslaved bat0.
*
* Return: -ENOMEM on memory allocation error or the number of
* items added to the mcast_list otherwise.
@@ -62,12 +107,13 @@
static int batadv_mcast_mla_softif_get(struct net_device *dev,
struct hlist_head *mcast_list)
{
+ struct net_device *bridge = batadv_mcast_get_bridge(dev);
struct netdev_hw_addr *mc_list_entry;
struct batadv_hw_addr *new;
int ret = 0;
- netif_addr_lock_bh(dev);
- netdev_for_each_mc_addr(mc_list_entry, dev) {
+ netif_addr_lock_bh(bridge ? bridge : dev);
+ netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) {
new = kmalloc(sizeof(*new), GFP_ATOMIC);
if (!new) {
ret = -ENOMEM;
@@ -78,7 +124,10 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev,
hlist_add_head(&new->list, mcast_list);
ret++;
}
- netif_addr_unlock_bh(dev);
+ netif_addr_unlock_bh(bridge ? bridge : dev);
+
+ if (bridge)
+ dev_put(bridge);
return ret;
}
@@ -104,6 +153,83 @@ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
}
/**
+ * batadv_mcast_mla_br_addr_cpy - copy a bridge multicast address
+ * @dst: destination to write to - a multicast MAC address
+ * @src: source to read from - a multicast IP address
+ *
+ * Converts a given multicast IPv4/IPv6 address from a bridge
+ * to its matching multicast MAC address and copies it into the given
+ * destination buffer.
+ *
+ * Caller needs to make sure the destination buffer can hold
+ * at least ETH_ALEN bytes.
+ */
+static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
+{
+ if (src->proto == htons(ETH_P_IP))
+ ip_eth_mc_map(src->u.ip4, dst);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (src->proto == htons(ETH_P_IPV6))
+ ipv6_eth_mc_map(&src->u.ip6, dst);
+#endif
+ else
+ eth_zero_addr(dst);
+}
+
+/**
+ * batadv_mcast_mla_bridge_get - get bridged-in multicast listeners
+ * @dev: a bridge slave whose bridge to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ *
+ * Collects multicast addresses of multicast listeners residing
+ * on foreign, non-mesh devices which we gave access to our mesh via
+ * a bridge on top of the given soft interface, dev, in the given
+ * mcast_list.
+ *
+ * Return: -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
+ */
+static int batadv_mcast_mla_bridge_get(struct net_device *dev,
+ struct hlist_head *mcast_list)
+{
+ struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list);
+ struct br_ip_list *br_ip_entry, *tmp;
+ struct batadv_hw_addr *new;
+ u8 mcast_addr[ETH_ALEN];
+ int ret;
+
+ /* we don't need to detect these devices/listeners, the IGMP/MLD
+ * snooping code of the Linux bridge already does that for us
+ */
+ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list);
+ if (ret < 0)
+ goto out;
+
+ list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) {
+ batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr);
+ if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
+ continue;
+
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ether_addr_copy(new->addr, mcast_addr);
+ hlist_add_head(&new->list, mcast_list);
+ }
+
+out:
+ list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) {
+ list_del(&br_ip_entry->list);
+ kfree(br_ip_entry);
+ }
+
+ return ret;
+}
+
+/**
* batadv_mcast_mla_list_free - free a list of multicast addresses
* @bat_priv: the bat priv with all the soft interface information
* @mcast_list: the list to free
@@ -214,44 +340,195 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
}
/**
+ * batadv_mcast_querier_log - debug output regarding the querier status on link
+ * @bat_priv: the bat priv with all the soft interface information
+ * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD")
+ * @old_state: the previous querier state on our link
+ * @new_state: the new querier state on our link
+ *
+ * Outputs debug messages to the logging facility with log level 'mcast'
+ * regarding changes to the querier status on the link which are relevant
+ * to our multicast optimizations.
+ *
+ * Usually this is about whether a querier appeared or vanished in
+ * our mesh or whether the querier is in the suboptimal position of being
+ * behind our local bridge segment: Snooping switches will directly
+ * forward listener reports to the querier, therefore batman-adv and
+ * the bridge will potentially not see these listeners - the querier is
+ * potentially shadowing listeners from us then.
+ *
+ * This is only interesting for nodes with a bridge on top of their
+ * soft interface.
+ */
+static void
+batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
+ struct batadv_mcast_querier_state *old_state,
+ struct batadv_mcast_querier_state *new_state)
+{
+ if (!old_state->exists && new_state->exists)
+ batadv_info(bat_priv->soft_iface, "%s Querier appeared\n",
+ str_proto);
+ else if (old_state->exists && !new_state->exists)
+ batadv_info(bat_priv->soft_iface,
+ "%s Querier disappeared - multicast optimizations disabled\n",
+ str_proto);
+ else if (!bat_priv->mcast.bridged && !new_state->exists)
+ batadv_info(bat_priv->soft_iface,
+ "No %s Querier present - multicast optimizations disabled\n",
+ str_proto);
+
+ if (new_state->exists) {
+ if ((!old_state->shadowing && new_state->shadowing) ||
+ (!old_state->exists && new_state->shadowing))
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "%s Querier is behind our bridged segment: Might shadow listeners\n",
+ str_proto);
+ else if (old_state->shadowing && !new_state->shadowing)
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "%s Querier is not behind our bridged segment\n",
+ str_proto);
+ }
+}
+
+/**
+ * batadv_mcast_bridge_log - debug output for topology changes in bridged setups
+ * @bat_priv: the bat priv with all the soft interface information
+ * @bridged: a flag about whether the soft interface is currently bridged or not
+ * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier
+ * @querier_ipv6: (maybe) new status of a potential, selected MLD querier
+ *
+ * If no bridges are ever used on this node, then this function does nothing.
+ *
+ * Otherwise this function outputs debug information to the 'mcast' log level
+ * which might be relevant to our multicast optimizations.
+ *
+ * More precisely, it outputs information when a bridge interface is added or
+ * removed from a soft interface. And when a bridge is present, it further
+ * outputs information about the querier state which is relevant for the
+ * multicast flags this node is going to set.
+ */
+static void
+batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged,
+ struct batadv_mcast_querier_state *querier_ipv4,
+ struct batadv_mcast_querier_state *querier_ipv6)
+{
+ if (!bat_priv->mcast.bridged && bridged)
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "Bridge added: Setting Unsnoopables(U)-flag\n");
+ else if (bat_priv->mcast.bridged && !bridged)
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "Bridge removed: Unsetting Unsnoopables(U)-flag\n");
+
+ if (bridged) {
+ batadv_mcast_querier_log(bat_priv, "IGMP",
+ &bat_priv->mcast.querier_ipv4,
+ querier_ipv4);
+ batadv_mcast_querier_log(bat_priv, "MLD",
+ &bat_priv->mcast.querier_ipv6,
+ querier_ipv6);
+ }
+}
+
+/**
+ * batadv_mcast_flags_logs - output debug information about mcast flag changes
+ * @bat_priv: the bat priv with all the soft interface information
+ * @flags: flags indicating the new multicast state
+ *
+ * Whenever the multicast flags this nodes announces changes (@mcast_flags vs.
+ * bat_priv->mcast.flags), this notifies userspace via the 'mcast' log level.
+ */
+static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags)
+{
+ u8 old_flags = bat_priv->mcast.flags;
+ char str_old_flags[] = "[...]";
+
+ sprintf(str_old_flags, "[%c%c%c]",
+ (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
+ (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
+ (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "Changing multicast flags from '%s' to '[%c%c%c]'\n",
+ bat_priv->mcast.enabled ? str_old_flags : "<undefined>",
+ (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
+ (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
+ (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+}
+
+/**
* batadv_mcast_mla_tvlv_update - update multicast tvlv
* @bat_priv: the bat priv with all the soft interface information
*
* Updates the own multicast tvlv with our current multicast related settings,
* capabilities and inabilities.
*
- * Return: true if the tvlv container is registered afterwards. Otherwise
- * returns false.
+ * Return: false if we want all IPv4 && IPv6 multicast traffic and true
+ * otherwise.
*/
static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv)
{
struct batadv_tvlv_mcast_data mcast_data;
+ struct batadv_mcast_querier_state querier4 = {false, false};
+ struct batadv_mcast_querier_state querier6 = {false, false};
+ struct net_device *dev = bat_priv->soft_iface;
+ bool bridged;
mcast_data.flags = BATADV_NO_FLAGS;
memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved));
- /* Avoid attaching MLAs, if there is a bridge on top of our soft
- * interface, we don't support that yet (TODO)
+ bridged = batadv_mcast_has_bridge(bat_priv);
+ if (!bridged)
+ goto update;
+
+#if !IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)
+ pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n");
+#endif
+
+ querier4.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP);
+ querier4.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP);
+
+ querier6.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6);
+ querier6.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6);
+
+ mcast_data.flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES;
+
+ /* 1) If no querier exists at all, then multicast listeners on
+ * our local TT clients behind the bridge will keep silent.
+ * 2) If the selected querier is on one of our local TT clients,
+ * behind the bridge, then this querier might shadow multicast
+ * listeners on our local TT clients, behind this bridge.
+ *
+ * In both cases, we will signalize other batman nodes that
+ * we need all multicast traffic of the according protocol.
*/
- if (batadv_mcast_has_bridge(bat_priv)) {
- if (bat_priv->mcast.enabled) {
- batadv_tvlv_container_unregister(bat_priv,
- BATADV_TVLV_MCAST, 1);
- bat_priv->mcast.enabled = false;
- }
+ if (!querier4.exists || querier4.shadowing)
+ mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV4;
- return false;
- }
+ if (!querier6.exists || querier6.shadowing)
+ mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV6;
+
+update:
+ batadv_mcast_bridge_log(bat_priv, bridged, &querier4, &querier6);
+
+ bat_priv->mcast.querier_ipv4.exists = querier4.exists;
+ bat_priv->mcast.querier_ipv4.shadowing = querier4.shadowing;
+
+ bat_priv->mcast.querier_ipv6.exists = querier6.exists;
+ bat_priv->mcast.querier_ipv6.shadowing = querier6.shadowing;
+
+ bat_priv->mcast.bridged = bridged;
if (!bat_priv->mcast.enabled ||
mcast_data.flags != bat_priv->mcast.flags) {
- batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 1,
+ batadv_mcast_flags_log(bat_priv, mcast_data.flags);
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2,
&mcast_data, sizeof(mcast_data));
bat_priv->mcast.flags = mcast_data.flags;
bat_priv->mcast.enabled = true;
}
- return true;
+ return !(mcast_data.flags &
+ (BATADV_MCAST_WANT_ALL_IPV4 + BATADV_MCAST_WANT_ALL_IPV6));
}
/**
@@ -274,6 +551,10 @@ void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
if (ret < 0)
goto out;
+ ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list);
+ if (ret < 0)
+ goto out;
+
update:
batadv_mcast_mla_tt_retract(bat_priv, &mcast_list);
batadv_mcast_mla_tt_add(bat_priv, &mcast_list);
@@ -283,6 +564,31 @@ out:
}
/**
+ * batadv_mcast_is_report_ipv4 - check for IGMP reports
+ * @skb: the ethernet frame destined for the mesh
+ *
+ * This call might reallocate skb data.
+ *
+ * Checks whether the given frame is a valid IGMP report.
+ *
+ * Return: If so then true, otherwise false.
+ */
+static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb)
+{
+ if (ip_mc_check_igmp(skb, NULL) < 0)
+ return false;
+
+ switch (igmp_hdr(skb)->type) {
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ return true;
+ }
+
+ return false;
+}
+
+/**
* batadv_mcast_forw_mode_check_ipv4 - check for optimized forwarding potential
* @bat_priv: the bat priv with all the soft interface information
* @skb: the IPv4 packet to check
@@ -304,6 +610,9 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr)))
return -ENOMEM;
+ if (batadv_mcast_is_report_ipv4(skb))
+ return -EINVAL;
+
iphdr = ip_hdr(skb);
/* TODO: Implement Multicast Router Discovery (RFC4286),
@@ -320,6 +629,31 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
return 0;
}
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * batadv_mcast_is_report_ipv6 - check for MLD reports
+ * @skb: the ethernet frame destined for the mesh
+ *
+ * This call might reallocate skb data.
+ *
+ * Checks whether the given frame is a valid MLD report.
+ *
+ * Return: If so then true, otherwise false.
+ */
+static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb)
+{
+ if (ipv6_mc_check_mld(skb, NULL) < 0)
+ return false;
+
+ switch (icmp6_hdr(skb)->icmp6_type) {
+ case ICMPV6_MGM_REPORT:
+ case ICMPV6_MLD2_REPORT:
+ return true;
+ }
+
+ return false;
+}
+
/**
* batadv_mcast_forw_mode_check_ipv6 - check for optimized forwarding potential
* @bat_priv: the bat priv with all the soft interface information
@@ -341,6 +675,9 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr)))
return -ENOMEM;
+ if (batadv_mcast_is_report_ipv6(skb))
+ return -EINVAL;
+
ip6hdr = ipv6_hdr(skb);
/* TODO: Implement Multicast Router Discovery (RFC4286),
@@ -357,6 +694,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
return 0;
}
+#endif
/**
* batadv_mcast_forw_mode_check - check for optimized forwarding potential
@@ -385,9 +723,11 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
case ETH_P_IP:
return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
is_unsnoopable);
+#if IS_ENABLED(CONFIG_IPV6)
case ETH_P_IPV6:
return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb,
is_unsnoopable);
+#endif
default:
return -EINVAL;
}
@@ -728,18 +1068,18 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
}
/**
- * batadv_mcast_tvlv_ogm_handler_v1 - process incoming multicast tvlv container
+ * batadv_mcast_tvlv_ogm_handler - process incoming multicast tvlv container
* @bat_priv: the bat priv with all the soft interface information
* @orig: the orig_node of the ogm
* @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
* @tvlv_value: tvlv buffer containing the multicast data
* @tvlv_value_len: tvlv buffer length
*/
-static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig,
- u8 flags,
- void *tvlv_value,
- u16 tvlv_value_len)
+static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags,
+ void *tvlv_value,
+ u16 tvlv_value_len)
{
bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
u8 mcast_flags = BATADV_NO_FLAGS;
@@ -789,19 +1129,120 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
*/
void batadv_mcast_init(struct batadv_priv *bat_priv)
{
- batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler_v1,
- NULL, BATADV_TVLV_MCAST, 1,
+ batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler,
+ NULL, BATADV_TVLV_MCAST, 2,
BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
}
/**
+ * batadv_mcast_flags_print_header - print own mcast flags to debugfs table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq: debugfs table seq_file struct
+ *
+ * Prints our own multicast flags including a more specific reason why
+ * they are set, that is prints the bridge and querier state too, to
+ * the debugfs table specified via @seq.
+ */
+static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv,
+ struct seq_file *seq)
+{
+ u8 flags = bat_priv->mcast.flags;
+ char querier4, querier6, shadowing4, shadowing6;
+ bool bridged = bat_priv->mcast.bridged;
+
+ if (bridged) {
+ querier4 = bat_priv->mcast.querier_ipv4.exists ? '.' : '4';
+ querier6 = bat_priv->mcast.querier_ipv6.exists ? '.' : '6';
+ shadowing4 = bat_priv->mcast.querier_ipv4.shadowing ? '4' : '.';
+ shadowing6 = bat_priv->mcast.querier_ipv6.shadowing ? '6' : '.';
+ } else {
+ querier4 = '?';
+ querier6 = '?';
+ shadowing4 = '?';
+ shadowing6 = '?';
+ }
+
+ seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n",
+ (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
+ (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
+ (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+ seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.');
+ seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n",
+ querier4, querier6);
+ seq_printf(seq, "* Shadowing IGMP/MLD Querier [4/6]:\t%c/%c\n",
+ shadowing4, shadowing6);
+ seq_puts(seq, "-------------------------------------------\n");
+ seq_printf(seq, " %-10s %s\n", "Originator", "Flags");
+}
+
+/**
+ * batadv_mcast_flags_seq_print_text - print the mcast flags of other nodes
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * This prints a table of (primary) originators and their according
+ * multicast flags, including (in the header) our own.
+ *
+ * Return: always 0
+ */
+int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hard_iface *primary_if;
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct batadv_orig_node *orig_node;
+ struct hlist_head *head;
+ u8 flags;
+ u32 i;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ return 0;
+
+ batadv_mcast_flags_print_header(bat_priv, seq);
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
+ &orig_node->capa_initialized))
+ continue;
+
+ if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
+ &orig_node->capabilities)) {
+ seq_printf(seq, "%pM -\n", orig_node->orig);
+ continue;
+ }
+
+ flags = orig_node->mcast_flags;
+
+ seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig,
+ (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)
+ ? 'U' : '.',
+ (flags & BATADV_MCAST_WANT_ALL_IPV4)
+ ? '4' : '.',
+ (flags & BATADV_MCAST_WANT_ALL_IPV6)
+ ? '6' : '.');
+ }
+ rcu_read_unlock();
+ }
+
+ batadv_hardif_put(primary_if);
+
+ return 0;
+}
+
+/**
* batadv_mcast_free - free the multicast optimizations structures
* @bat_priv: the bat priv with all the soft interface information
*/
void batadv_mcast_free(struct batadv_priv *bat_priv)
{
- batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1);
- batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1);
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
spin_lock_bh(&bat_priv->tt.commit_lock);
batadv_mcast_mla_tt_retract(bat_priv, NULL);
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 80bceec55..1fb00ba84 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -20,6 +20,7 @@
#include "main.h"
+struct seq_file;
struct sk_buff;
/**
@@ -46,6 +47,8 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
void batadv_mcast_init(struct batadv_priv *bat_priv);
+int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset);
+
void batadv_mcast_free(struct batadv_priv *bat_priv);
void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node);
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
new file mode 100644
index 000000000..231f8eaf0
--- /dev/null
+++ b/net/batman-adv/netlink.c
@@ -0,0 +1,424 @@
+/* Copyright (C) 2016 B.A.T.M.A.N. contributors:
+ *
+ * Matthias Schiffer
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "netlink.h"
+#include "main.h"
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genetlink.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/printk.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "hard-interface.h"
+#include "soft-interface.h"
+#include "tp_meter.h"
+
+struct sk_buff;
+
+static struct genl_family batadv_netlink_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = BATADV_NL_NAME,
+ .version = 1,
+ .maxattr = BATADV_ATTR_MAX,
+};
+
+/* multicast groups */
+enum batadv_netlink_multicast_groups {
+ BATADV_NL_MCGRP_TPMETER,
+};
+
+static struct genl_multicast_group batadv_netlink_mcgrps[] = {
+ [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER },
+};
+
+static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
+ [BATADV_ATTR_VERSION] = { .type = NLA_STRING },
+ [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING },
+ [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 },
+ [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING },
+ [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 },
+ [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING },
+ [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 },
+ [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 },
+ [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 },
+ [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 },
+};
+
+/**
+ * batadv_netlink_mesh_info_put - fill in generic information about mesh
+ * interface
+ * @msg: netlink message to be sent back
+ * @soft_iface: interface for which the data should be taken
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int
+batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_hard_iface *primary_if = NULL;
+ struct net_device *hard_iface;
+ int ret = -ENOBUFS;
+
+ if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) ||
+ nla_put_string(msg, BATADV_ATTR_ALGO_NAME,
+ bat_priv->algo_ops->name) ||
+ nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) ||
+ nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) ||
+ nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN,
+ soft_iface->dev_addr))
+ goto out;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) {
+ hard_iface = primary_if->net_dev;
+
+ if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ hard_iface->ifindex) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ hard_iface->name) ||
+ nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN,
+ hard_iface->dev_addr))
+ goto out;
+ }
+
+ ret = 0;
+
+ out:
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+
+ return ret;
+}
+
+/**
+ * batadv_netlink_get_mesh_info - handle incoming BATADV_CMD_GET_MESH_INFO
+ * netlink request
+ * @skb: received netlink message
+ * @info: receiver information
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int
+batadv_netlink_get_mesh_info(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct net_device *soft_iface;
+ struct sk_buff *msg = NULL;
+ void *msg_head;
+ int ifindex;
+ int ret;
+
+ if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
+ if (!ifindex)
+ return -EINVAL;
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &batadv_netlink_family, 0,
+ BATADV_CMD_GET_MESH_INFO);
+ if (!msg_head) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ ret = batadv_netlink_mesh_info_put(msg, soft_iface);
+
+ out:
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ if (ret) {
+ if (msg)
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ genlmsg_end(msg, msg_head);
+ return genlmsg_reply(msg, info);
+}
+
+/**
+ * batadv_netlink_tp_meter_put - Fill information of started tp_meter session
+ * @msg: netlink message to be sent back
+ * @cookie: tp meter session cookie
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int
+batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie)
+{
+ if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie))
+ return -ENOBUFS;
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_tpmeter_notify - send tp_meter result via netlink to client
+ * @bat_priv: the bat priv with all the soft interface information
+ * @dst: destination of tp_meter session
+ * @result: reason for tp meter session stop
+ * @test_time: total time ot the tp_meter session
+ * @total_bytes: bytes acked to the receiver
+ * @cookie: cookie of tp_meter session
+ *
+ * Return: 0 on success, < 0 on error
+ */
+int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
+ u8 result, u32 test_time, u64 total_bytes,
+ u32 cookie)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &batadv_netlink_family, 0,
+ BATADV_CMD_TP_METER);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto err_genlmsg;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_TPMETER_TEST_TIME, test_time))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, BATADV_ATTR_TPMETER_BYTES, total_bytes,
+ BATADV_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, BATADV_ATTR_TPMETER_RESULT, result))
+ goto nla_put_failure;
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, dst))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&batadv_netlink_family,
+ dev_net(bat_priv->soft_iface), msg, 0,
+ BATADV_NL_MCGRP_TPMETER, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ ret = -EMSGSIZE;
+
+err_genlmsg:
+ nlmsg_free(msg);
+ return ret;
+}
+
+/**
+ * batadv_netlink_tp_meter_start - Start a new tp_meter session
+ * @skb: received netlink message
+ * @info: receiver information
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int
+batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct net_device *soft_iface;
+ struct batadv_priv *bat_priv;
+ struct sk_buff *msg = NULL;
+ u32 test_length;
+ void *msg_head;
+ int ifindex;
+ u32 cookie;
+ u8 *dst;
+ int ret;
+
+ if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
+ return -EINVAL;
+
+ if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS])
+ return -EINVAL;
+
+ if (!info->attrs[BATADV_ATTR_TPMETER_TEST_TIME])
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
+ if (!ifindex)
+ return -EINVAL;
+
+ dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]);
+
+ test_length = nla_get_u32(info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]);
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &batadv_netlink_family, 0,
+ BATADV_CMD_TP_METER);
+ if (!msg_head) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ bat_priv = netdev_priv(soft_iface);
+ batadv_tp_start(bat_priv, dst, test_length, &cookie);
+
+ ret = batadv_netlink_tp_meter_put(msg, cookie);
+
+ out:
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ if (ret) {
+ if (msg)
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ genlmsg_end(msg, msg_head);
+ return genlmsg_reply(msg, info);
+}
+
+/**
+ * batadv_netlink_tp_meter_start - Cancel a running tp_meter session
+ * @skb: received netlink message
+ * @info: receiver information
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int
+batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct net_device *soft_iface;
+ struct batadv_priv *bat_priv;
+ int ifindex;
+ u8 *dst;
+ int ret = 0;
+
+ if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
+ return -EINVAL;
+
+ if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS])
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
+ if (!ifindex)
+ return -EINVAL;
+
+ dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]);
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ bat_priv = netdev_priv(soft_iface);
+ batadv_tp_stop(bat_priv, dst, BATADV_TP_REASON_CANCEL);
+
+out:
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ return ret;
+}
+
+static struct genl_ops batadv_netlink_ops[] = {
+ {
+ .cmd = BATADV_CMD_GET_MESH_INFO,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .doit = batadv_netlink_get_mesh_info,
+ },
+ {
+ .cmd = BATADV_CMD_TP_METER,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .doit = batadv_netlink_tp_meter_start,
+ },
+ {
+ .cmd = BATADV_CMD_TP_METER_CANCEL,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .doit = batadv_netlink_tp_meter_cancel,
+ },
+};
+
+/**
+ * batadv_netlink_register - register batadv genl netlink family
+ */
+void __init batadv_netlink_register(void)
+{
+ int ret;
+
+ ret = genl_register_family_with_ops_groups(&batadv_netlink_family,
+ batadv_netlink_ops,
+ batadv_netlink_mcgrps);
+ if (ret)
+ pr_warn("unable to register netlink family");
+}
+
+/**
+ * batadv_netlink_unregister - unregister batadv genl netlink family
+ */
+void batadv_netlink_unregister(void)
+{
+ genl_unregister_family(&batadv_netlink_family);
+}
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
new file mode 100644
index 000000000..945653ab5
--- /dev/null
+++ b/net/batman-adv/netlink.h
@@ -0,0 +1,32 @@
+/* Copyright (C) 2016 B.A.T.M.A.N. contributors:
+ *
+ * Matthias Schiffer
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_NETLINK_H_
+#define _NET_BATMAN_ADV_NETLINK_H_
+
+#include "main.h"
+
+#include <linux/types.h>
+
+void batadv_netlink_register(void);
+void batadv_netlink_unregister(void);
+
+int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
+ u8 result, u32 test_time, u64 total_bytes,
+ u32 cookie);
+
+#endif /* _NET_BATMAN_ADV_NETLINK_H_ */
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 678f06865..293ef4ffd 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -51,10 +51,12 @@
#include "hard-interface.h"
#include "hash.h"
+#include "log.h"
#include "originator.h"
#include "packet.h"
#include "routing.h"
#include "send.h"
+#include "tvlv.h"
static struct lock_class_key batadv_nc_coding_hash_lock_class_key;
static struct lock_class_key batadv_nc_decoding_hash_lock_class_key;
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index ab8c4f973..3940b5d24 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -34,11 +34,13 @@
#include <linux/spinlock.h>
#include <linux/workqueue.h>
+#include "bat_algo.h"
#include "distributed-arp-table.h"
#include "fragmentation.h"
#include "gateway_client.h"
#include "hard-interface.h"
#include "hash.h"
+#include "log.h"
#include "multicast.h"
#include "network-coding.h"
#include "routing.h"
@@ -251,10 +253,8 @@ static void batadv_neigh_node_release(struct kref *ref)
struct hlist_node *node_tmp;
struct batadv_neigh_node *neigh_node;
struct batadv_neigh_ifinfo *neigh_ifinfo;
- struct batadv_algo_ops *bao;
neigh_node = container_of(ref, struct batadv_neigh_node, refcount);
- bao = neigh_node->orig_node->bat_priv->bat_algo_ops;
hlist_for_each_entry_safe(neigh_ifinfo, node_tmp,
&neigh_node->ifinfo_list, list) {
@@ -263,9 +263,6 @@ static void batadv_neigh_node_release(struct kref *ref)
batadv_hardif_neigh_put(neigh_node->hardif_neigh);
- if (bao->bat_neigh_free)
- bao->bat_neigh_free(neigh_node);
-
batadv_hardif_put(neigh_node->if_incoming);
kfree_rcu(neigh_node, rcu);
@@ -537,8 +534,8 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
kref_init(&hardif_neigh->refcount);
- if (bat_priv->bat_algo_ops->bat_hardif_neigh_init)
- bat_priv->bat_algo_ops->bat_hardif_neigh_init(hardif_neigh);
+ if (bat_priv->algo_ops->neigh.hardif_init)
+ bat_priv->algo_ops->neigh.hardif_init(hardif_neigh);
hlist_add_head(&hardif_neigh->list, &hard_iface->neigh_list);
@@ -602,19 +599,19 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
}
/**
- * batadv_neigh_node_new - create and init a new neigh_node object
+ * batadv_neigh_node_create - create a neigh node object
* @orig_node: originator object representing the neighbour
* @hard_iface: the interface where the neighbour is connected to
* @neigh_addr: the mac address of the neighbour interface
*
* Allocates a new neigh_node object and initialises all the generic fields.
*
- * Return: neighbor when found. Othwerwise NULL
+ * Return: the neighbour node if found or created or NULL otherwise.
*/
-struct batadv_neigh_node *
-batadv_neigh_node_new(struct batadv_orig_node *orig_node,
- struct batadv_hard_iface *hard_iface,
- const u8 *neigh_addr)
+static struct batadv_neigh_node *
+batadv_neigh_node_create(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
{
struct batadv_neigh_node *neigh_node;
struct batadv_hardif_neigh_node *hardif_neigh = NULL;
@@ -667,6 +664,29 @@ out:
}
/**
+ * batadv_neigh_node_get_or_create - retrieve or create a neigh node object
+ * @orig_node: originator object representing the neighbour
+ * @hard_iface: the interface where the neighbour is connected to
+ * @neigh_addr: the mac address of the neighbour interface
+ *
+ * Return: the neighbour node if found or created or NULL otherwise.
+ */
+struct batadv_neigh_node *
+batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
+{
+ struct batadv_neigh_node *neigh_node = NULL;
+
+ /* first check without locking to avoid the overhead */
+ neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
+ if (neigh_node)
+ return neigh_node;
+
+ return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr);
+}
+
+/**
* batadv_hardif_neigh_seq_print_text - print the single hop neighbour list
* @seq: neighbour table seq_file struct
* @offset: not used
@@ -686,17 +706,17 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
BATADV_SOURCE_VERSION, primary_if->net_dev->name,
primary_if->net_dev->dev_addr, net_dev->name,
- bat_priv->bat_algo_ops->name);
+ bat_priv->algo_ops->name);
batadv_hardif_put(primary_if);
- if (!bat_priv->bat_algo_ops->bat_neigh_print) {
+ if (!bat_priv->algo_ops->neigh.print) {
seq_puts(seq,
"No printing function for this routing protocol\n");
return 0;
}
- bat_priv->bat_algo_ops->bat_neigh_print(bat_priv, seq);
+ bat_priv->algo_ops->neigh.print(bat_priv, seq);
return 0;
}
@@ -747,8 +767,8 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
batadv_frag_purge_orig(orig_node, NULL);
- if (orig_node->bat_priv->bat_algo_ops->bat_orig_free)
- orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node);
+ if (orig_node->bat_priv->algo_ops->orig.free)
+ orig_node->bat_priv->algo_ops->orig.free(orig_node);
kfree(orig_node->tt_buff);
kfree(orig_node);
@@ -1092,12 +1112,12 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv,
struct batadv_hard_iface *if_outgoing)
{
struct batadv_neigh_node *best = NULL, *neigh;
- struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+ struct batadv_algo_ops *bao = bat_priv->algo_ops;
rcu_read_lock();
hlist_for_each_entry_rcu(neigh, &orig_node->neigh_list, list) {
- if (best && (bao->bat_neigh_cmp(neigh, if_outgoing,
- best, if_outgoing) <= 0))
+ if (best && (bao->neigh.cmp(neigh, if_outgoing, best,
+ if_outgoing) <= 0))
continue;
if (!kref_get_unless_zero(&neigh->refcount))
@@ -1249,18 +1269,17 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
BATADV_SOURCE_VERSION, primary_if->net_dev->name,
primary_if->net_dev->dev_addr, net_dev->name,
- bat_priv->bat_algo_ops->name);
+ bat_priv->algo_ops->name);
batadv_hardif_put(primary_if);
- if (!bat_priv->bat_algo_ops->bat_orig_print) {
+ if (!bat_priv->algo_ops->orig.print) {
seq_puts(seq,
"No printing function for this routing protocol\n");
return 0;
}
- bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq,
- BATADV_IF_DEFAULT);
+ bat_priv->algo_ops->orig.print(bat_priv, seq, BATADV_IF_DEFAULT);
return 0;
}
@@ -1287,7 +1306,7 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset)
}
bat_priv = netdev_priv(hard_iface->soft_iface);
- if (!bat_priv->bat_algo_ops->bat_orig_print) {
+ if (!bat_priv->algo_ops->orig.print) {
seq_puts(seq,
"No printing function for this routing protocol\n");
goto out;
@@ -1301,9 +1320,9 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset)
seq_printf(seq, "[B.A.T.M.A.N. adv %s, IF/MAC: %s/%pM (%s %s)]\n",
BATADV_SOURCE_VERSION, hard_iface->net_dev->name,
hard_iface->net_dev->dev_addr,
- hard_iface->soft_iface->name, bat_priv->bat_algo_ops->name);
+ hard_iface->soft_iface->name, bat_priv->algo_ops->name);
- bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq, hard_iface);
+ bat_priv->algo_ops->orig.print(bat_priv, seq, hard_iface);
out:
if (hard_iface)
@@ -1315,7 +1334,7 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
int max_if_num)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+ struct batadv_algo_ops *bao = bat_priv->algo_ops;
struct batadv_hashtable *hash = bat_priv->orig_hash;
struct hlist_head *head;
struct batadv_orig_node *orig_node;
@@ -1331,9 +1350,8 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
rcu_read_lock();
hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
ret = 0;
- if (bao->bat_orig_add_if)
- ret = bao->bat_orig_add_if(orig_node,
- max_if_num);
+ if (bao->orig.add_if)
+ ret = bao->orig.add_if(orig_node, max_if_num);
if (ret == -ENOMEM)
goto err;
}
@@ -1355,7 +1373,7 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
struct hlist_head *head;
struct batadv_hard_iface *hard_iface_tmp;
struct batadv_orig_node *orig_node;
- struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+ struct batadv_algo_ops *bao = bat_priv->algo_ops;
u32 i;
int ret;
@@ -1368,10 +1386,9 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
rcu_read_lock();
hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
ret = 0;
- if (bao->bat_orig_del_if)
- ret = bao->bat_orig_del_if(orig_node,
- max_if_num,
- hard_iface->if_num);
+ if (bao->orig.del_if)
+ ret = bao->orig.del_if(orig_node, max_if_num,
+ hard_iface->if_num);
if (ret == -ENOMEM)
goto err;
}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 64a8951e5..566306bf0 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -46,9 +46,9 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
void
batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh);
struct batadv_neigh_node *
-batadv_neigh_node_new(struct batadv_orig_node *orig_node,
- struct batadv_hard_iface *hard_iface,
- const u8 *neigh_addr);
+batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr);
void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node);
struct batadv_neigh_node *
batadv_orig_router_get(struct batadv_orig_node *orig_node,
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 372128ddb..6b011ff64 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -21,6 +21,8 @@
#include <asm/byteorder.h>
#include <linux/types.h>
+#define batadv_tp_is_error(n) ((u8)n > 127 ? 1 : 0)
+
/**
* enum batadv_packettype - types for batman-adv encapsulated packets
* @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV
@@ -93,6 +95,7 @@ enum batadv_icmp_packettype {
BATADV_ECHO_REQUEST = 8,
BATADV_TTL_EXCEEDED = 11,
BATADV_PARAMETER_PROBLEM = 12,
+ BATADV_TP = 15,
};
/**
@@ -285,6 +288,16 @@ struct batadv_elp_packet {
#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
/**
+ * enum batadv_icmp_user_cmd_type - types for batman-adv icmp cmd modes
+ * @BATADV_TP_START: start a throughput meter run
+ * @BATADV_TP_STOP: stop a throughput meter run
+ */
+enum batadv_icmp_user_cmd_type {
+ BATADV_TP_START = 0,
+ BATADV_TP_STOP = 2,
+};
+
+/**
* struct batadv_icmp_header - common members among all the ICMP packets
* @packet_type: batman-adv packet type, part of the general header
* @version: batman-adv protocol version, part of the genereal header
@@ -334,6 +347,47 @@ struct batadv_icmp_packet {
__be16 seqno;
};
+/**
+ * struct batadv_icmp_tp_packet - ICMP TP Meter packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @subtype: TP packet subtype (see batadv_icmp_tp_subtype)
+ * @session: TP session identifier
+ * @seqno: the TP sequence number
+ * @timestamp: time when the packet has been sent. This value is filled in a
+ * TP_MSG and echoed back in the next TP_ACK so that the sender can compute the
+ * RTT. Since it is read only by the host which wrote it, there is no need to
+ * store it using network order
+ */
+struct batadv_icmp_tp_packet {
+ u8 packet_type;
+ u8 version;
+ u8 ttl;
+ u8 msg_type; /* see ICMP message types above */
+ u8 dst[ETH_ALEN];
+ u8 orig[ETH_ALEN];
+ u8 uid;
+ u8 subtype;
+ u8 session[2];
+ __be32 seqno;
+ __be32 timestamp;
+};
+
+/**
+ * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes
+ * @BATADV_TP_MSG: Msg from sender to receiver
+ * @BATADV_TP_ACK: acknowledgment from receiver to sender
+ */
+enum batadv_icmp_tp_subtype {
+ BATADV_TP_MSG = 0,
+ BATADV_TP_ACK,
+};
+
#define BATADV_RR_LEN 16
/**
@@ -420,6 +474,7 @@ struct batadv_unicast_4addr_packet {
* @dest: final destination used when routing fragments
* @orig: originator of the fragment used when merging the packet
* @no: fragment number within this sequence
+ * @priority: priority of frame, from ToS IP precedence or 802.1p
* @reserved: reserved byte for alignment
* @seqno: sequence identification
* @total_size: size of the merged packet
@@ -430,9 +485,11 @@ struct batadv_frag_packet {
u8 ttl;
#if defined(__BIG_ENDIAN_BITFIELD)
u8 no:4;
- u8 reserved:4;
+ u8 priority:3;
+ u8 reserved:1;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 reserved:4;
+ u8 reserved:1;
+ u8 priority:3;
u8 no:4;
#else
#error "unknown bitfield endianness"
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index bfac086b4..3d199478c 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -40,12 +40,15 @@
#include "fragmentation.h"
#include "hard-interface.h"
#include "icmp_socket.h"
+#include "log.h"
#include "network-coding.h"
#include "originator.h"
#include "packet.h"
#include "send.h"
#include "soft-interface.h"
+#include "tp_meter.h"
#include "translation-table.h"
+#include "tvlv.h"
static int batadv_route_unicast_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if);
@@ -268,10 +271,19 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
icmph->ttl = BATADV_TTL;
res = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (res != NET_XMIT_DROP)
- ret = NET_RX_SUCCESS;
+ if (res == -1)
+ goto out;
+
+ ret = NET_RX_SUCCESS;
break;
+ case BATADV_TP:
+ if (!pskb_may_pull(skb, sizeof(struct batadv_icmp_tp_packet)))
+ goto out;
+
+ batadv_tp_meter_recv(bat_priv, skb);
+ ret = NET_RX_SUCCESS;
+ goto out;
default:
/* drop unknown type */
goto out;
@@ -290,7 +302,7 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if = NULL;
struct batadv_orig_node *orig_node = NULL;
struct batadv_icmp_packet *icmp_packet;
- int ret = NET_RX_DROP;
+ int res, ret = NET_RX_DROP;
icmp_packet = (struct batadv_icmp_packet *)skb->data;
@@ -321,7 +333,8 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
icmp_packet->msg_type = BATADV_TTL_EXCEEDED;
icmp_packet->ttl = BATADV_TTL;
- if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP)
+ res = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (res != -1)
ret = NET_RX_SUCCESS;
out:
@@ -341,7 +354,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
struct ethhdr *ethhdr;
struct batadv_orig_node *orig_node = NULL;
int hdr_size = sizeof(struct batadv_icmp_header);
- int ret = NET_RX_DROP;
+ int res, ret = NET_RX_DROP;
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
@@ -408,7 +421,8 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
icmph->ttl--;
/* route it */
- if (batadv_send_skb_to_orig(skb, orig_node, recv_if) != NET_XMIT_DROP)
+ res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
+ if (res != -1)
ret = NET_RX_SUCCESS;
out:
@@ -456,6 +470,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
}
/**
+ * batadv_last_bonding_get - Get last_bonding_candidate of orig_node
+ * @orig_node: originator node whose last bonding candidate should be retrieved
+ *
+ * Return: last bonding candidate of router or NULL if not found
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+static struct batadv_orig_ifinfo *
+batadv_last_bonding_get(struct batadv_orig_node *orig_node)
+{
+ struct batadv_orig_ifinfo *last_bonding_candidate;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ last_bonding_candidate = orig_node->last_bonding_candidate;
+
+ if (last_bonding_candidate)
+ kref_get(&last_bonding_candidate->refcount);
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ return last_bonding_candidate;
+}
+
+/**
* batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node
* @orig_node: originator node whose bonding candidates should be replaced
* @new_candidate: new bonding candidate or NULL
@@ -492,7 +529,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
struct batadv_hard_iface *recv_if)
{
- struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+ struct batadv_algo_ops *bao = bat_priv->algo_ops;
struct batadv_neigh_node *first_candidate_router = NULL;
struct batadv_neigh_node *next_candidate_router = NULL;
struct batadv_neigh_node *router, *cand_router = NULL;
@@ -525,7 +562,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
* router - obviously there are no other candidates.
*/
rcu_read_lock();
- last_candidate = orig_node->last_bonding_candidate;
+ last_candidate = batadv_last_bonding_get(orig_node);
if (last_candidate)
last_cand_router = rcu_dereference(last_candidate->router);
@@ -546,9 +583,9 @@ batadv_find_router(struct batadv_priv *bat_priv,
/* alternative candidate should be good enough to be
* considered
*/
- if (!bao->bat_neigh_is_similar_or_better(cand_router,
- cand->if_outgoing,
- router, recv_if))
+ if (!bao->neigh.is_similar_or_better(cand_router,
+ cand->if_outgoing, router,
+ recv_if))
goto next;
/* don't use the same router twice */
@@ -617,6 +654,9 @@ next:
batadv_orig_ifinfo_put(next_candidate);
}
+ if (last_candidate)
+ batadv_orig_ifinfo_put(last_candidate);
+
return router;
}
@@ -671,6 +711,8 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
len = skb->len;
res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
+ if (res == -1)
+ goto out;
/* translate transmit result into receive result */
if (res == NET_XMIT_SUCCESS) {
@@ -678,13 +720,10 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
len + ETH_HLEN);
-
- ret = NET_RX_SUCCESS;
- } else if (res == NET_XMIT_POLICED) {
- /* skb was buffered and consumed */
- ret = NET_RX_SUCCESS;
}
+ ret = NET_RX_SUCCESS;
+
out:
if (orig_node)
batadv_orig_node_put(orig_node);
@@ -1033,6 +1072,8 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
if (!orig_node_src)
goto out;
+ skb->priority = frag_packet->priority + 256;
+
/* Route the fragment if it is not for us and too big to be merged. */
if (!batadv_is_my_mac(bat_priv, frag_packet->dest) &&
batadv_frag_skb_fwd(skb, recv_if, orig_node_src)) {
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 010397650..619115948 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -20,10 +20,11 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/fs.h>
-#include <linux/if_ether.h>
#include <linux/if.h>
+#include <linux/if_ether.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/kref.h>
@@ -42,6 +43,7 @@
#include "fragmentation.h"
#include "gateway_client.h"
#include "hard-interface.h"
+#include "log.h"
#include "network-coding.h"
#include "originator.h"
#include "routing.h"
@@ -71,6 +73,7 @@ int batadv_send_skb_packet(struct sk_buff *skb,
{
struct batadv_priv *bat_priv;
struct ethhdr *ethhdr;
+ int ret;
bat_priv = netdev_priv(hard_iface->soft_iface);
@@ -108,8 +111,15 @@ int batadv_send_skb_packet(struct sk_buff *skb,
/* dev_queue_xmit() returns a negative result on error. However on
* congestion and traffic shaping, it drops and returns NET_XMIT_DROP
* (which is > 0). This will not be treated as an error.
+ *
+ * a negative value cannot be returned because it could be interepreted
+ * as not consumed skb by callers of batadv_send_skb_to_orig.
*/
- return dev_queue_xmit(skb);
+ ret = dev_queue_xmit(skb);
+ if (ret < 0)
+ ret = NET_XMIT_DROP;
+
+ return ret;
send_skb_err:
kfree_skb(skb);
return NET_XMIT_DROP;
@@ -155,8 +165,11 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
* host, NULL can be passed as recv_if and no interface alternating is
* attempted.
*
- * Return: NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or
- * NET_XMIT_POLICED if the skb is buffered for later transmit.
+ * Return: -1 on failure (and the skb is not consumed), -EINPROGRESS if the
+ * skb is buffered for later transmit or the NET_XMIT status returned by the
+ * lower routine if the packet has been passed down.
+ *
+ * If the returning value is not -1 the skb has been consumed.
*/
int batadv_send_skb_to_orig(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
@@ -164,7 +177,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
{
struct batadv_priv *bat_priv = orig_node->bat_priv;
struct batadv_neigh_node *neigh_node;
- int ret = NET_XMIT_DROP;
+ int ret = -1;
/* batadv_find_router() increases neigh_nodes refcount if found. */
neigh_node = batadv_find_router(bat_priv, orig_node, recv_if);
@@ -177,8 +190,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
if (atomic_read(&bat_priv->fragmentation) &&
skb->len > neigh_node->if_incoming->net_dev->mtu) {
/* Fragment and send packet. */
- if (batadv_frag_send_packet(skb, orig_node, neigh_node))
- ret = NET_XMIT_SUCCESS;
+ ret = batadv_frag_send_packet(skb, orig_node, neigh_node);
goto out;
}
@@ -187,12 +199,10 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
* (i.e. being forwarded). If the packet originates from this node or if
* network coding fails, then send the packet as usual.
*/
- if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) {
- ret = NET_XMIT_POLICED;
- } else {
- batadv_send_unicast_skb(skb, neigh_node);
- ret = NET_XMIT_SUCCESS;
- }
+ if (recv_if && batadv_nc_skb_forward(skb, neigh_node))
+ ret = -EINPROGRESS;
+ else
+ ret = batadv_send_unicast_skb(skb, neigh_node);
out:
if (neigh_node)
@@ -318,7 +328,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
{
struct batadv_unicast_packet *unicast_packet;
struct ethhdr *ethhdr;
- int ret = NET_XMIT_DROP;
+ int res, ret = NET_XMIT_DROP;
if (!orig_node)
goto out;
@@ -355,7 +365,8 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid))
unicast_packet->ttvn = unicast_packet->ttvn - 1;
- if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP)
+ res = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (res != -1)
ret = NET_XMIT_SUCCESS;
out:
@@ -428,27 +439,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
BATADV_P_DATA, orig_node, vid);
}
-void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface)
-{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
-
- if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) ||
- (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED))
- return;
-
- /* the interface gets activated here to avoid race conditions between
- * the moment of activating the interface in
- * hardif_activate_interface() where the originator mac is set and
- * outdated packets (especially uninitialized mac addresses) in the
- * packet queue
- */
- if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED)
- hard_iface->if_status = BATADV_IF_ACTIVE;
-
- bat_priv->bat_algo_ops->bat_ogm_schedule(hard_iface);
-}
-
-static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet)
+void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet)
{
kfree_skb(forw_packet->skb);
if (forw_packet->if_incoming)
@@ -604,45 +595,6 @@ out:
atomic_inc(&bat_priv->bcast_queue_left);
}
-void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work)
-{
- struct delayed_work *delayed_work;
- struct batadv_forw_packet *forw_packet;
- struct batadv_priv *bat_priv;
-
- delayed_work = to_delayed_work(work);
- forw_packet = container_of(delayed_work, struct batadv_forw_packet,
- delayed_work);
- bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface);
- spin_lock_bh(&bat_priv->forw_bat_list_lock);
- hlist_del(&forw_packet->list);
- spin_unlock_bh(&bat_priv->forw_bat_list_lock);
-
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
- goto out;
-
- bat_priv->bat_algo_ops->bat_ogm_emit(forw_packet);
-
- /* we have to have at least one packet in the queue to determine the
- * queues wake up time unless we are shutting down.
- *
- * only re-schedule if this is the "original" copy, e.g. the OGM of the
- * primary interface should only be rescheduled once per period, but
- * this function will be called for the forw_packet instances of the
- * other secondary interfaces as well.
- */
- if (forw_packet->own &&
- forw_packet->if_incoming == forw_packet->if_outgoing)
- batadv_schedule_bat_ogm(forw_packet->if_incoming);
-
-out:
- /* don't count own packet */
- if (!forw_packet->own)
- atomic_inc(&bat_priv->batman_queue_left);
-
- batadv_forw_packet_free(forw_packet);
-}
-
void
batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
const struct batadv_hard_iface *hard_iface)
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 6fd7270d8..7cecb7563 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -26,8 +26,8 @@
#include "packet.h"
struct sk_buff;
-struct work_struct;
+void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet);
int batadv_send_skb_to_orig(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
struct batadv_hard_iface *recv_if);
@@ -38,11 +38,9 @@ int batadv_send_broadcast_skb(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface);
int batadv_send_unicast_skb(struct sk_buff *skb,
struct batadv_neigh_node *neigh_node);
-void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface);
int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
const struct sk_buff *skb,
unsigned long delay);
-void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work);
void
batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
const struct batadv_hard_iface *hard_iface);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 287a3879e..7527c0652 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -48,6 +48,7 @@
#include <linux/types.h>
#include <linux/workqueue.h>
+#include "bat_algo.h"
#include "bridge_loop_avoidance.h"
#include "debugfs.h"
#include "distributed-arp-table.h"
@@ -255,7 +256,7 @@ static int batadv_interface_tx(struct sk_buff *skb,
if (batadv_compare_eth(ethhdr->h_dest, ectp_addr))
goto dropped;
- gw_mode = atomic_read(&bat_priv->gw_mode);
+ gw_mode = atomic_read(&bat_priv->gw.mode);
if (is_multicast_ether_addr(ethhdr->h_dest)) {
/* if gw mode is off, broadcast every packet */
if (gw_mode == BATADV_GW_MODE_OFF) {
@@ -808,6 +809,10 @@ static int batadv_softif_init_late(struct net_device *dev)
atomic_set(&bat_priv->distributed_arp_table, 1);
#endif
#ifdef CONFIG_BATMAN_ADV_MCAST
+ bat_priv->mcast.querier_ipv4.exists = false;
+ bat_priv->mcast.querier_ipv4.shadowing = false;
+ bat_priv->mcast.querier_ipv6.exists = false;
+ bat_priv->mcast.querier_ipv6.shadowing = false;
bat_priv->mcast.flags = BATADV_NO_FLAGS;
atomic_set(&bat_priv->multicast_mode, 1);
atomic_set(&bat_priv->mcast.num_disabled, 0);
@@ -815,8 +820,8 @@ static int batadv_softif_init_late(struct net_device *dev)
atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0);
atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
#endif
- atomic_set(&bat_priv->gw_mode, BATADV_GW_MODE_OFF);
- atomic_set(&bat_priv->gw_sel_class, 20);
+ atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF);
+ atomic_set(&bat_priv->gw.sel_class, 20);
atomic_set(&bat_priv->gw.bandwidth_down, 100);
atomic_set(&bat_priv->gw.bandwidth_up, 20);
atomic_set(&bat_priv->orig_interval, 1000);
@@ -837,6 +842,8 @@ static int batadv_softif_init_late(struct net_device *dev)
#ifdef CONFIG_BATMAN_ADV_BLA
atomic_set(&bat_priv->bla.num_requests, 0);
#endif
+ atomic_set(&bat_priv->tp_num, 0);
+
bat_priv->tt.last_changeset = NULL;
bat_priv->tt.last_changeset_len = 0;
bat_priv->isolation_mark = 0;
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 414b20741..fe9ca94dd 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -25,8 +25,8 @@
#include <linux/fs.h>
#include <linux/if.h>
#include <linux/if_vlan.h>
-#include <linux/kref.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
#include <linux/rculist.h>
@@ -38,11 +38,12 @@
#include <linux/string.h>
#include <linux/stringify.h>
+#include "bridge_loop_avoidance.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
#include "gateway_common.h"
-#include "bridge_loop_avoidance.h"
#include "hard-interface.h"
+#include "log.h"
#include "network-coding.h"
#include "packet.h"
#include "soft-interface.h"
@@ -389,12 +390,12 @@ static int batadv_store_uint_attr(const char *buff, size_t count,
return count;
}
-static inline ssize_t
-__batadv_store_uint_attr(const char *buff, size_t count,
- int min, int max,
- void (*post_func)(struct net_device *),
- const struct attribute *attr,
- atomic_t *attr_store, struct net_device *net_dev)
+static ssize_t __batadv_store_uint_attr(const char *buff, size_t count,
+ int min, int max,
+ void (*post_func)(struct net_device *),
+ const struct attribute *attr,
+ atomic_t *attr_store,
+ struct net_device *net_dev)
{
int ret;
@@ -411,7 +412,7 @@ static ssize_t batadv_show_bat_algo(struct kobject *kobj,
{
struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
- return sprintf(buff, "%s\n", bat_priv->bat_algo_ops->name);
+ return sprintf(buff, "%s\n", bat_priv->algo_ops->name);
}
static void batadv_post_gw_reselect(struct net_device *net_dev)
@@ -427,7 +428,7 @@ static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr,
struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
int bytes_written;
- switch (atomic_read(&bat_priv->gw_mode)) {
+ switch (atomic_read(&bat_priv->gw.mode)) {
case BATADV_GW_MODE_CLIENT:
bytes_written = sprintf(buff, "%s\n",
BATADV_GW_MODE_CLIENT_NAME);
@@ -476,10 +477,10 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj,
return -EINVAL;
}
- if (atomic_read(&bat_priv->gw_mode) == gw_mode_tmp)
+ if (atomic_read(&bat_priv->gw.mode) == gw_mode_tmp)
return count;
- switch (atomic_read(&bat_priv->gw_mode)) {
+ switch (atomic_read(&bat_priv->gw.mode)) {
case BATADV_GW_MODE_CLIENT:
curr_gw_mode_str = BATADV_GW_MODE_CLIENT_NAME;
break;
@@ -508,7 +509,7 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj,
* state
*/
batadv_gw_check_client_stop(bat_priv);
- atomic_set(&bat_priv->gw_mode, (unsigned int)gw_mode_tmp);
+ atomic_set(&bat_priv->gw.mode, (unsigned int)gw_mode_tmp);
batadv_gw_tvlv_container_update(bat_priv);
return count;
}
@@ -624,7 +625,7 @@ BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR,
2 * BATADV_JITTER, INT_MAX, NULL);
BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0,
BATADV_TQ_MAX_VALUE, NULL);
-BATADV_ATTR_SIF_UINT(gw_sel_class, gw_sel_class, S_IRUGO | S_IWUSR, 1,
+BATADV_ATTR_SIF_UINT(gw_sel_class, gw.sel_class, S_IRUGO | S_IWUSR, 1,
BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect);
static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth,
batadv_store_gw_bwidth);
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
new file mode 100644
index 000000000..2333777f9
--- /dev/null
+++ b/net/batman-adv/tp_meter.c
@@ -0,0 +1,1507 @@
+/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+ *
+ * Edo Monticelli, Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "tp_meter.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/device.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/param.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "hard-interface.h"
+#include "log.h"
+#include "netlink.h"
+#include "originator.h"
+#include "packet.h"
+#include "send.h"
+
+/**
+ * BATADV_TP_DEF_TEST_LENGTH - Default test length if not specified by the user
+ * in milliseconds
+ */
+#define BATADV_TP_DEF_TEST_LENGTH 10000
+
+/**
+ * BATADV_TP_AWND - Advertised window by the receiver (in bytes)
+ */
+#define BATADV_TP_AWND 0x20000000
+
+/**
+ * BATADV_TP_RECV_TIMEOUT - Receiver activity timeout. If the receiver does not
+ * get anything for such amount of milliseconds, the connection is killed
+ */
+#define BATADV_TP_RECV_TIMEOUT 1000
+
+/**
+ * BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond
+ * such amound of milliseconds, the receiver is considered unreachable and the
+ * connection is killed
+ */
+#define BATADV_TP_MAX_RTO 30000
+
+/**
+ * BATADV_TP_FIRST_SEQ - First seqno of each session. The number is rather high
+ * in order to immediately trigger a wrap around (test purposes)
+ */
+#define BATADV_TP_FIRST_SEQ ((u32)-1 - 2000)
+
+/**
+ * BATADV_TP_PLEN - length of the payload (data after the batadv_unicast header)
+ * to simulate
+ */
+#define BATADV_TP_PLEN (BATADV_TP_PACKET_LEN - ETH_HLEN - \
+ sizeof(struct batadv_unicast_packet))
+
+static u8 batadv_tp_prerandom[4096] __read_mostly;
+
+/**
+ * batadv_tp_session_cookie - generate session cookie based on session ids
+ * @session: TP session identifier
+ * @icmp_uid: icmp pseudo uid of the tp session
+ *
+ * Return: 32 bit tp_meter session cookie
+ */
+static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid)
+{
+ u32 cookie;
+
+ cookie = icmp_uid << 16;
+ cookie |= session[0] << 8;
+ cookie |= session[1];
+
+ return cookie;
+}
+
+/**
+ * batadv_tp_cwnd - compute the new cwnd size
+ * @base: base cwnd size value
+ * @increment: the value to add to base to get the new size
+ * @min: minumim cwnd value (usually MSS)
+ *
+ * Return the new cwnd size and ensures it does not exceed the Advertised
+ * Receiver Window size. It is wrap around safe.
+ * For details refer to Section 3.1 of RFC5681
+ *
+ * Return: new congestion window size in bytes
+ */
+static u32 batadv_tp_cwnd(u32 base, u32 increment, u32 min)
+{
+ u32 new_size = base + increment;
+
+ /* check for wrap-around */
+ if (new_size < base)
+ new_size = (u32)ULONG_MAX;
+
+ new_size = min_t(u32, new_size, BATADV_TP_AWND);
+
+ return max_t(u32, new_size, min);
+}
+
+/**
+ * batadv_tp_updated_cwnd - update the Congestion Windows
+ * @tp_vars: the private data of the current TP meter session
+ * @mss: maximum segment size of transmission
+ *
+ * 1) if the session is in Slow Start, the CWND has to be increased by 1
+ * MSS every unique received ACK
+ * 2) if the session is in Congestion Avoidance, the CWND has to be
+ * increased by MSS * MSS / CWND for every unique received ACK
+ */
+static void batadv_tp_update_cwnd(struct batadv_tp_vars *tp_vars, u32 mss)
+{
+ spin_lock_bh(&tp_vars->cwnd_lock);
+
+ /* slow start... */
+ if (tp_vars->cwnd <= tp_vars->ss_threshold) {
+ tp_vars->dec_cwnd = 0;
+ tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss);
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+ return;
+ }
+
+ /* increment CWND at least of 1 (section 3.1 of RFC5681) */
+ tp_vars->dec_cwnd += max_t(u32, 1U << 3,
+ ((mss * mss) << 6) / (tp_vars->cwnd << 3));
+ if (tp_vars->dec_cwnd < (mss << 3)) {
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+ return;
+ }
+
+ tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss);
+ tp_vars->dec_cwnd = 0;
+
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+}
+
+/**
+ * batadv_tp_update_rto - calculate new retransmission timeout
+ * @tp_vars: the private data of the current TP meter session
+ * @new_rtt: new roundtrip time in msec
+ */
+static void batadv_tp_update_rto(struct batadv_tp_vars *tp_vars,
+ u32 new_rtt)
+{
+ long m = new_rtt;
+
+ /* RTT update
+ * Details in Section 2.2 and 2.3 of RFC6298
+ *
+ * It's tricky to understand. Don't lose hair please.
+ * Inspired by tcp_rtt_estimator() tcp_input.c
+ */
+ if (tp_vars->srtt != 0) {
+ m -= (tp_vars->srtt >> 3); /* m is now error in rtt est */
+ tp_vars->srtt += m; /* rtt = 7/8 srtt + 1/8 new */
+ if (m < 0)
+ m = -m;
+
+ m -= (tp_vars->rttvar >> 2);
+ tp_vars->rttvar += m; /* mdev ~= 3/4 rttvar + 1/4 new */
+ } else {
+ /* first measure getting in */
+ tp_vars->srtt = m << 3; /* take the measured time to be srtt */
+ tp_vars->rttvar = m << 1; /* new_rtt / 2 */
+ }
+
+ /* rto = srtt + 4 * rttvar.
+ * rttvar is scaled by 4, therefore doesn't need to be multiplied
+ */
+ tp_vars->rto = (tp_vars->srtt >> 3) + tp_vars->rttvar;
+}
+
+/**
+ * batadv_tp_batctl_notify - send client status result to client
+ * @reason: reason for tp meter session stop
+ * @dst: destination of tp_meter session
+ * @bat_priv: the bat priv with all the soft interface information
+ * @start_time: start of transmission in jiffies
+ * @total_sent: bytes acked to the receiver
+ * @cookie: cookie of tp_meter session
+ */
+static void batadv_tp_batctl_notify(enum batadv_tp_meter_reason reason,
+ const u8 *dst, struct batadv_priv *bat_priv,
+ unsigned long start_time, u64 total_sent,
+ u32 cookie)
+{
+ u32 test_time;
+ u8 result;
+ u32 total_bytes;
+
+ if (!batadv_tp_is_error(reason)) {
+ result = BATADV_TP_REASON_COMPLETE;
+ test_time = jiffies_to_msecs(jiffies - start_time);
+ total_bytes = total_sent;
+ } else {
+ result = reason;
+ test_time = 0;
+ total_bytes = 0;
+ }
+
+ batadv_netlink_tpmeter_notify(bat_priv, dst, result, test_time,
+ total_bytes, cookie);
+}
+
+/**
+ * batadv_tp_batctl_error_notify - send client error result to client
+ * @reason: reason for tp meter session stop
+ * @dst: destination of tp_meter session
+ * @bat_priv: the bat priv with all the soft interface information
+ * @cookie: cookie of tp_meter session
+ */
+static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason,
+ const u8 *dst,
+ struct batadv_priv *bat_priv,
+ u32 cookie)
+{
+ batadv_tp_batctl_notify(reason, dst, bat_priv, 0, 0, cookie);
+}
+
+/**
+ * batadv_tp_list_find - find a tp_vars object in the global list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @dst: the other endpoint MAC address to look for
+ *
+ * Look for a tp_vars object matching dst as end_point and return it after
+ * having incremented the refcounter. Return NULL is not found
+ *
+ * Return: matching tp_vars or NULL when no tp_vars with @dst was found
+ */
+static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv,
+ const u8 *dst)
+{
+ struct batadv_tp_vars *pos, *tp_vars = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(pos, &bat_priv->tp_list, list) {
+ if (!batadv_compare_eth(pos->other_end, dst))
+ continue;
+
+ /* most of the time this function is invoked during the normal
+ * process..it makes sens to pay more when the session is
+ * finished and to speed the process up during the measurement
+ */
+ if (unlikely(!kref_get_unless_zero(&pos->refcount)))
+ continue;
+
+ tp_vars = pos;
+ break;
+ }
+ rcu_read_unlock();
+
+ return tp_vars;
+}
+
+/**
+ * batadv_tp_list_find_session - find tp_vars session object in the global list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @dst: the other endpoint MAC address to look for
+ * @session: session identifier
+ *
+ * Look for a tp_vars object matching dst as end_point, session as tp meter
+ * session and return it after having incremented the refcounter. Return NULL
+ * is not found
+ *
+ * Return: matching tp_vars or NULL when no tp_vars was found
+ */
+static struct batadv_tp_vars *
+batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst,
+ const u8 *session)
+{
+ struct batadv_tp_vars *pos, *tp_vars = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(pos, &bat_priv->tp_list, list) {
+ if (!batadv_compare_eth(pos->other_end, dst))
+ continue;
+
+ if (memcmp(pos->session, session, sizeof(pos->session)) != 0)
+ continue;
+
+ /* most of the time this function is invoked during the normal
+ * process..it makes sense to pay more when the session is
+ * finished and to speed the process up during the measurement
+ */
+ if (unlikely(!kref_get_unless_zero(&pos->refcount)))
+ continue;
+
+ tp_vars = pos;
+ break;
+ }
+ rcu_read_unlock();
+
+ return tp_vars;
+}
+
+/**
+ * batadv_tp_vars_release - release batadv_tp_vars from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the batadv_tp_vars
+ */
+static void batadv_tp_vars_release(struct kref *ref)
+{
+ struct batadv_tp_vars *tp_vars;
+ struct batadv_tp_unacked *un, *safe;
+
+ tp_vars = container_of(ref, struct batadv_tp_vars, refcount);
+
+ /* lock should not be needed because this object is now out of any
+ * context!
+ */
+ spin_lock_bh(&tp_vars->unacked_lock);
+ list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
+ list_del(&un->list);
+ kfree(un);
+ }
+ spin_unlock_bh(&tp_vars->unacked_lock);
+
+ kfree_rcu(tp_vars, rcu);
+}
+
+/**
+ * batadv_tp_vars_put - decrement the batadv_tp_vars refcounter and possibly
+ * release it
+ * @tp_vars: the private data of the current TP meter session to be free'd
+ */
+static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars)
+{
+ kref_put(&tp_vars->refcount, batadv_tp_vars_release);
+}
+
+/**
+ * batadv_tp_sender_cleanup - cleanup sender data and drop and timer
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tp_vars: the private data of the current TP meter session to cleanup
+ */
+static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv,
+ struct batadv_tp_vars *tp_vars)
+{
+ cancel_delayed_work(&tp_vars->finish_work);
+
+ spin_lock_bh(&tp_vars->bat_priv->tp_list_lock);
+ hlist_del_rcu(&tp_vars->list);
+ spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock);
+
+ /* drop list reference */
+ batadv_tp_vars_put(tp_vars);
+
+ atomic_dec(&tp_vars->bat_priv->tp_num);
+
+ /* kill the timer and remove its reference */
+ del_timer_sync(&tp_vars->timer);
+ /* the worker might have rearmed itself therefore we kill it again. Note
+ * that if the worker should run again before invoking the following
+ * del_timer(), it would not re-arm itself once again because the status
+ * is OFF now
+ */
+ del_timer(&tp_vars->timer);
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_sender_end - print info about ended session and inform client
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tp_vars: the private data of the current TP meter session
+ */
+static void batadv_tp_sender_end(struct batadv_priv *bat_priv,
+ struct batadv_tp_vars *tp_vars)
+{
+ u32 session_cookie;
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Test towards %pM finished..shutting down (reason=%d)\n",
+ tp_vars->other_end, tp_vars->reason);
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Last timing stats: SRTT=%ums RTTVAR=%ums RTO=%ums\n",
+ tp_vars->srtt >> 3, tp_vars->rttvar >> 2, tp_vars->rto);
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Final values: cwnd=%u ss_threshold=%u\n",
+ tp_vars->cwnd, tp_vars->ss_threshold);
+
+ session_cookie = batadv_tp_session_cookie(tp_vars->session,
+ tp_vars->icmp_uid);
+
+ batadv_tp_batctl_notify(tp_vars->reason,
+ tp_vars->other_end,
+ bat_priv,
+ tp_vars->start_time,
+ atomic64_read(&tp_vars->tot_sent),
+ session_cookie);
+}
+
+/**
+ * batadv_tp_sender_shutdown - let sender thread/timer stop gracefully
+ * @tp_vars: the private data of the current TP meter session
+ * @reason: reason for tp meter session stop
+ */
+static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars,
+ enum batadv_tp_meter_reason reason)
+{
+ if (!atomic_dec_and_test(&tp_vars->sending))
+ return;
+
+ tp_vars->reason = reason;
+}
+
+/**
+ * batadv_tp_sender_finish - stop sender session after test_length was reached
+ * @work: delayed work reference of the related tp_vars
+ */
+static void batadv_tp_sender_finish(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_tp_vars *tp_vars;
+
+ delayed_work = to_delayed_work(work);
+ tp_vars = container_of(delayed_work, struct batadv_tp_vars,
+ finish_work);
+
+ batadv_tp_sender_shutdown(tp_vars, BATADV_TP_REASON_COMPLETE);
+}
+
+/**
+ * batadv_tp_reset_sender_timer - reschedule the sender timer
+ * @tp_vars: the private TP meter data for this session
+ *
+ * Reschedule the timer using tp_vars->rto as delay
+ */
+static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars)
+{
+ /* most of the time this function is invoked while normal packet
+ * reception...
+ */
+ if (unlikely(atomic_read(&tp_vars->sending) == 0))
+ /* timer ref will be dropped in batadv_tp_sender_cleanup */
+ return;
+
+ mod_timer(&tp_vars->timer, jiffies + msecs_to_jiffies(tp_vars->rto));
+}
+
+/**
+ * batadv_tp_sender_timeout - timer that fires in case of packet loss
+ * @arg: address of the related tp_vars
+ *
+ * If fired it means that there was packet loss.
+ * Switch to Slow Start, set the ss_threshold to half of the current cwnd and
+ * reset the cwnd to 3*MSS
+ */
+static void batadv_tp_sender_timeout(unsigned long arg)
+{
+ struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg;
+ struct batadv_priv *bat_priv = tp_vars->bat_priv;
+
+ if (atomic_read(&tp_vars->sending) == 0)
+ return;
+
+ /* if the user waited long enough...shutdown the test */
+ if (unlikely(tp_vars->rto >= BATADV_TP_MAX_RTO)) {
+ batadv_tp_sender_shutdown(tp_vars,
+ BATADV_TP_REASON_DST_UNREACHABLE);
+ return;
+ }
+
+ /* RTO exponential backoff
+ * Details in Section 5.5 of RFC6298
+ */
+ tp_vars->rto <<= 1;
+
+ spin_lock_bh(&tp_vars->cwnd_lock);
+
+ tp_vars->ss_threshold = tp_vars->cwnd >> 1;
+ if (tp_vars->ss_threshold < BATADV_TP_PLEN * 2)
+ tp_vars->ss_threshold = BATADV_TP_PLEN * 2;
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: RTO fired during test towards %pM! cwnd=%u new ss_thr=%u, resetting last_sent to %u\n",
+ tp_vars->other_end, tp_vars->cwnd, tp_vars->ss_threshold,
+ atomic_read(&tp_vars->last_acked));
+
+ tp_vars->cwnd = BATADV_TP_PLEN * 3;
+
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+
+ /* resend the non-ACKed packets.. */
+ tp_vars->last_sent = atomic_read(&tp_vars->last_acked);
+ wake_up(&tp_vars->more_bytes);
+
+ batadv_tp_reset_sender_timer(tp_vars);
+}
+
+/**
+ * batadv_tp_fill_prerandom - Fill buffer with prefetched random bytes
+ * @tp_vars: the private TP meter data for this session
+ * @buf: Buffer to fill with bytes
+ * @nbytes: amount of pseudorandom bytes
+ */
+static void batadv_tp_fill_prerandom(struct batadv_tp_vars *tp_vars,
+ u8 *buf, size_t nbytes)
+{
+ u32 local_offset;
+ size_t bytes_inbuf;
+ size_t to_copy;
+ size_t pos = 0;
+
+ spin_lock_bh(&tp_vars->prerandom_lock);
+ local_offset = tp_vars->prerandom_offset;
+ tp_vars->prerandom_offset += nbytes;
+ tp_vars->prerandom_offset %= sizeof(batadv_tp_prerandom);
+ spin_unlock_bh(&tp_vars->prerandom_lock);
+
+ while (nbytes) {
+ local_offset %= sizeof(batadv_tp_prerandom);
+ bytes_inbuf = sizeof(batadv_tp_prerandom) - local_offset;
+ to_copy = min(nbytes, bytes_inbuf);
+
+ memcpy(&buf[pos], &batadv_tp_prerandom[local_offset], to_copy);
+ pos += to_copy;
+ nbytes -= to_copy;
+ local_offset = 0;
+ }
+}
+
+/**
+ * batadv_tp_send_msg - send a single message
+ * @tp_vars: the private TP meter data for this session
+ * @src: source mac address
+ * @orig_node: the originator of the destination
+ * @seqno: sequence number of this packet
+ * @len: length of the entire packet
+ * @session: session identifier
+ * @uid: local ICMP "socket" index
+ * @timestamp: timestamp in jiffies which is replied in ack
+ *
+ * Create and send a single TP Meter message.
+ *
+ * Return: 0 on success, BATADV_TP_REASON_DST_UNREACHABLE if the destination is
+ * not reachable, BATADV_TP_REASON_MEMORY_ERROR if the packet couldn't be
+ * allocated
+ */
+static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src,
+ struct batadv_orig_node *orig_node,
+ u32 seqno, size_t len, const u8 *session,
+ int uid, u32 timestamp)
+{
+ struct batadv_icmp_tp_packet *icmp;
+ struct sk_buff *skb;
+ int r;
+ u8 *data;
+ size_t data_len;
+
+ skb = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN);
+ if (unlikely(!skb))
+ return BATADV_TP_REASON_MEMORY_ERROR;
+
+ skb_reserve(skb, ETH_HLEN);
+ icmp = (struct batadv_icmp_tp_packet *)skb_put(skb, sizeof(*icmp));
+
+ /* fill the icmp header */
+ ether_addr_copy(icmp->dst, orig_node->orig);
+ ether_addr_copy(icmp->orig, src);
+ icmp->version = BATADV_COMPAT_VERSION;
+ icmp->packet_type = BATADV_ICMP;
+ icmp->ttl = BATADV_TTL;
+ icmp->msg_type = BATADV_TP;
+ icmp->uid = uid;
+
+ icmp->subtype = BATADV_TP_MSG;
+ memcpy(icmp->session, session, sizeof(icmp->session));
+ icmp->seqno = htonl(seqno);
+ icmp->timestamp = htonl(timestamp);
+
+ data_len = len - sizeof(*icmp);
+ data = (u8 *)skb_put(skb, data_len);
+ batadv_tp_fill_prerandom(tp_vars, data, data_len);
+
+ r = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (r == -1)
+ kfree_skb(skb);
+
+ if (r == NET_XMIT_SUCCESS)
+ return 0;
+
+ return BATADV_TP_REASON_CANT_SEND;
+}
+
+/**
+ * batadv_tp_recv_ack - ACK receiving function
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the buffer containing the received packet
+ *
+ * Process a received TP ACK packet
+ */
+static void batadv_tp_recv_ack(struct batadv_priv *bat_priv,
+ const struct sk_buff *skb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ const struct batadv_icmp_tp_packet *icmp;
+ struct batadv_tp_vars *tp_vars;
+ size_t packet_len, mss;
+ u32 rtt, recv_ack, cwnd;
+ unsigned char *dev_addr;
+
+ packet_len = BATADV_TP_PLEN;
+ mss = BATADV_TP_PLEN;
+ packet_len += sizeof(struct batadv_unicast_packet);
+
+ icmp = (struct batadv_icmp_tp_packet *)skb->data;
+
+ /* find the tp_vars */
+ tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
+ icmp->session);
+ if (unlikely(!tp_vars))
+ return;
+
+ if (unlikely(atomic_read(&tp_vars->sending) == 0))
+ goto out;
+
+ /* old ACK? silently drop it.. */
+ if (batadv_seq_before(ntohl(icmp->seqno),
+ (u32)atomic_read(&tp_vars->last_acked)))
+ goto out;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (unlikely(!primary_if))
+ goto out;
+
+ orig_node = batadv_orig_hash_find(bat_priv, icmp->orig);
+ if (unlikely(!orig_node))
+ goto out;
+
+ /* update RTO with the new sampled RTT, if any */
+ rtt = jiffies_to_msecs(jiffies) - ntohl(icmp->timestamp);
+ if (icmp->timestamp && rtt)
+ batadv_tp_update_rto(tp_vars, rtt);
+
+ /* ACK for new data... reset the timer */
+ batadv_tp_reset_sender_timer(tp_vars);
+
+ recv_ack = ntohl(icmp->seqno);
+
+ /* check if this ACK is a duplicate */
+ if (atomic_read(&tp_vars->last_acked) == recv_ack) {
+ atomic_inc(&tp_vars->dup_acks);
+ if (atomic_read(&tp_vars->dup_acks) != 3)
+ goto out;
+
+ if (recv_ack >= tp_vars->recover)
+ goto out;
+
+ /* if this is the third duplicate ACK do Fast Retransmit */
+ batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr,
+ orig_node, recv_ack, packet_len,
+ icmp->session, icmp->uid,
+ jiffies_to_msecs(jiffies));
+
+ spin_lock_bh(&tp_vars->cwnd_lock);
+
+ /* Fast Recovery */
+ tp_vars->fast_recovery = true;
+ /* Set recover to the last outstanding seqno when Fast Recovery
+ * is entered. RFC6582, Section 3.2, step 1
+ */
+ tp_vars->recover = tp_vars->last_sent;
+ tp_vars->ss_threshold = tp_vars->cwnd >> 1;
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: Fast Recovery, (cur cwnd=%u) ss_thr=%u last_sent=%u recv_ack=%u\n",
+ tp_vars->cwnd, tp_vars->ss_threshold,
+ tp_vars->last_sent, recv_ack);
+ tp_vars->cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 3 * mss,
+ mss);
+ tp_vars->dec_cwnd = 0;
+ tp_vars->last_sent = recv_ack;
+
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+ } else {
+ /* count the acked data */
+ atomic64_add(recv_ack - atomic_read(&tp_vars->last_acked),
+ &tp_vars->tot_sent);
+ /* reset the duplicate ACKs counter */
+ atomic_set(&tp_vars->dup_acks, 0);
+
+ if (tp_vars->fast_recovery) {
+ /* partial ACK */
+ if (batadv_seq_before(recv_ack, tp_vars->recover)) {
+ /* this is another hole in the window. React
+ * immediately as specified by NewReno (see
+ * Section 3.2 of RFC6582 for details)
+ */
+ dev_addr = primary_if->net_dev->dev_addr;
+ batadv_tp_send_msg(tp_vars, dev_addr,
+ orig_node, recv_ack,
+ packet_len, icmp->session,
+ icmp->uid,
+ jiffies_to_msecs(jiffies));
+ tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd,
+ mss, mss);
+ } else {
+ tp_vars->fast_recovery = false;
+ /* set cwnd to the value of ss_threshold at the
+ * moment that Fast Recovery was entered.
+ * RFC6582, Section 3.2, step 3
+ */
+ cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 0,
+ mss);
+ tp_vars->cwnd = cwnd;
+ }
+ goto move_twnd;
+ }
+
+ if (recv_ack - atomic_read(&tp_vars->last_acked) >= mss)
+ batadv_tp_update_cwnd(tp_vars, mss);
+move_twnd:
+ /* move the Transmit Window */
+ atomic_set(&tp_vars->last_acked, recv_ack);
+ }
+
+ wake_up(&tp_vars->more_bytes);
+out:
+ if (likely(primary_if))
+ batadv_hardif_put(primary_if);
+ if (likely(orig_node))
+ batadv_orig_node_put(orig_node);
+ if (likely(tp_vars))
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_avail - check if congestion window is not full
+ * @tp_vars: the private data of the current TP meter session
+ * @payload_len: size of the payload of a single message
+ *
+ * Return: true when congestion window is not full, false otherwise
+ */
+static bool batadv_tp_avail(struct batadv_tp_vars *tp_vars,
+ size_t payload_len)
+{
+ u32 win_left, win_limit;
+
+ win_limit = atomic_read(&tp_vars->last_acked) + tp_vars->cwnd;
+ win_left = win_limit - tp_vars->last_sent;
+
+ return win_left >= payload_len;
+}
+
+/**
+ * batadv_tp_wait_available - wait until congestion window becomes free or
+ * timeout is reached
+ * @tp_vars: the private data of the current TP meter session
+ * @plen: size of the payload of a single message
+ *
+ * Return: 0 if the condition evaluated to false after the timeout elapsed,
+ * 1 if the condition evaluated to true after the timeout elapsed, the
+ * remaining jiffies (at least 1) if the condition evaluated to true before
+ * the timeout elapsed, or -ERESTARTSYS if it was interrupted by a signal.
+ */
+static int batadv_tp_wait_available(struct batadv_tp_vars *tp_vars, size_t plen)
+{
+ int ret;
+
+ ret = wait_event_interruptible_timeout(tp_vars->more_bytes,
+ batadv_tp_avail(tp_vars, plen),
+ HZ / 10);
+
+ return ret;
+}
+
+/**
+ * batadv_tp_send - main sending thread of a tp meter session
+ * @arg: address of the related tp_vars
+ *
+ * Return: nothing, this function never returns
+ */
+static int batadv_tp_send(void *arg)
+{
+ struct batadv_tp_vars *tp_vars = arg;
+ struct batadv_priv *bat_priv = tp_vars->bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ size_t payload_len, packet_len;
+ int err = 0;
+
+ if (unlikely(tp_vars->role != BATADV_TP_SENDER)) {
+ err = BATADV_TP_REASON_DST_UNREACHABLE;
+ tp_vars->reason = err;
+ goto out;
+ }
+
+ orig_node = batadv_orig_hash_find(bat_priv, tp_vars->other_end);
+ if (unlikely(!orig_node)) {
+ err = BATADV_TP_REASON_DST_UNREACHABLE;
+ tp_vars->reason = err;
+ goto out;
+ }
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (unlikely(!primary_if)) {
+ err = BATADV_TP_REASON_DST_UNREACHABLE;
+ goto out;
+ }
+
+ /* assume that all the hard_interfaces have a correctly
+ * configured MTU, so use the soft_iface MTU as MSS.
+ * This might not be true and in that case the fragmentation
+ * should be used.
+ * Now, try to send the packet as it is
+ */
+ payload_len = BATADV_TP_PLEN;
+ BUILD_BUG_ON(sizeof(struct batadv_icmp_tp_packet) > BATADV_TP_PLEN);
+
+ batadv_tp_reset_sender_timer(tp_vars);
+
+ /* queue the worker in charge of terminating the test */
+ queue_delayed_work(batadv_event_workqueue, &tp_vars->finish_work,
+ msecs_to_jiffies(tp_vars->test_length));
+
+ while (atomic_read(&tp_vars->sending) != 0) {
+ if (unlikely(!batadv_tp_avail(tp_vars, payload_len))) {
+ batadv_tp_wait_available(tp_vars, payload_len);
+ continue;
+ }
+
+ /* to emulate normal unicast traffic, add to the payload len
+ * the size of the unicast header
+ */
+ packet_len = payload_len + sizeof(struct batadv_unicast_packet);
+
+ err = batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr,
+ orig_node, tp_vars->last_sent,
+ packet_len,
+ tp_vars->session, tp_vars->icmp_uid,
+ jiffies_to_msecs(jiffies));
+
+ /* something went wrong during the preparation/transmission */
+ if (unlikely(err && err != BATADV_TP_REASON_CANT_SEND)) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: batadv_tp_send() cannot send packets (%d)\n",
+ err);
+ /* ensure nobody else tries to stop the thread now */
+ if (atomic_dec_and_test(&tp_vars->sending))
+ tp_vars->reason = err;
+ break;
+ }
+
+ /* right-shift the TWND */
+ if (!err)
+ tp_vars->last_sent += payload_len;
+
+ cond_resched();
+ }
+
+out:
+ if (likely(primary_if))
+ batadv_hardif_put(primary_if);
+ if (likely(orig_node))
+ batadv_orig_node_put(orig_node);
+
+ batadv_tp_sender_end(bat_priv, tp_vars);
+ batadv_tp_sender_cleanup(bat_priv, tp_vars);
+
+ batadv_tp_vars_put(tp_vars);
+
+ do_exit(0);
+}
+
+/**
+ * batadv_tp_start_kthread - start new thread which manages the tp meter sender
+ * @tp_vars: the private data of the current TP meter session
+ */
+static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars)
+{
+ struct task_struct *kthread;
+ struct batadv_priv *bat_priv = tp_vars->bat_priv;
+ u32 session_cookie;
+
+ kref_get(&tp_vars->refcount);
+ kthread = kthread_create(batadv_tp_send, tp_vars, "kbatadv_tp_meter");
+ if (IS_ERR(kthread)) {
+ session_cookie = batadv_tp_session_cookie(tp_vars->session,
+ tp_vars->icmp_uid);
+ pr_err("batadv: cannot create tp meter kthread\n");
+ batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR,
+ tp_vars->other_end,
+ bat_priv, session_cookie);
+
+ /* drop reserved reference for kthread */
+ batadv_tp_vars_put(tp_vars);
+
+ /* cleanup of failed tp meter variables */
+ batadv_tp_sender_cleanup(bat_priv, tp_vars);
+ return;
+ }
+
+ wake_up_process(kthread);
+}
+
+/**
+ * batadv_tp_start - start a new tp meter session
+ * @bat_priv: the bat priv with all the soft interface information
+ * @dst: the receiver MAC address
+ * @test_length: test length in milliseconds
+ * @cookie: session cookie
+ */
+void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
+ u32 test_length, u32 *cookie)
+{
+ struct batadv_tp_vars *tp_vars;
+ u8 session_id[2];
+ u8 icmp_uid;
+ u32 session_cookie;
+
+ get_random_bytes(session_id, sizeof(session_id));
+ get_random_bytes(&icmp_uid, 1);
+ session_cookie = batadv_tp_session_cookie(session_id, icmp_uid);
+ *cookie = session_cookie;
+
+ /* look for an already existing test towards this node */
+ spin_lock_bh(&bat_priv->tp_list_lock);
+ tp_vars = batadv_tp_list_find(bat_priv, dst);
+ if (tp_vars) {
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+ batadv_tp_vars_put(tp_vars);
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: test to or from the same node already ongoing, aborting\n");
+ batadv_tp_batctl_error_notify(BATADV_TP_REASON_ALREADY_ONGOING,
+ dst, bat_priv, session_cookie);
+ return;
+ }
+
+ if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) {
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: too many ongoing sessions, aborting (SEND)\n");
+ batadv_tp_batctl_error_notify(BATADV_TP_REASON_TOO_MANY, dst,
+ bat_priv, session_cookie);
+ return;
+ }
+
+ tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC);
+ if (!tp_vars) {
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: batadv_tp_start cannot allocate list elements\n");
+ batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR,
+ dst, bat_priv, session_cookie);
+ return;
+ }
+
+ /* initialize tp_vars */
+ ether_addr_copy(tp_vars->other_end, dst);
+ kref_init(&tp_vars->refcount);
+ tp_vars->role = BATADV_TP_SENDER;
+ atomic_set(&tp_vars->sending, 1);
+ memcpy(tp_vars->session, session_id, sizeof(session_id));
+ tp_vars->icmp_uid = icmp_uid;
+
+ tp_vars->last_sent = BATADV_TP_FIRST_SEQ;
+ atomic_set(&tp_vars->last_acked, BATADV_TP_FIRST_SEQ);
+ tp_vars->fast_recovery = false;
+ tp_vars->recover = BATADV_TP_FIRST_SEQ;
+
+ /* initialise the CWND to 3*MSS (Section 3.1 in RFC5681).
+ * For batman-adv the MSS is the size of the payload received by the
+ * soft_interface, hence its MTU
+ */
+ tp_vars->cwnd = BATADV_TP_PLEN * 3;
+ /* at the beginning initialise the SS threshold to the biggest possible
+ * window size, hence the AWND size
+ */
+ tp_vars->ss_threshold = BATADV_TP_AWND;
+
+ /* RTO initial value is 3 seconds.
+ * Details in Section 2.1 of RFC6298
+ */
+ tp_vars->rto = 1000;
+ tp_vars->srtt = 0;
+ tp_vars->rttvar = 0;
+
+ atomic64_set(&tp_vars->tot_sent, 0);
+
+ kref_get(&tp_vars->refcount);
+ setup_timer(&tp_vars->timer, batadv_tp_sender_timeout,
+ (unsigned long)tp_vars);
+
+ tp_vars->bat_priv = bat_priv;
+ tp_vars->start_time = jiffies;
+
+ init_waitqueue_head(&tp_vars->more_bytes);
+
+ spin_lock_init(&tp_vars->unacked_lock);
+ INIT_LIST_HEAD(&tp_vars->unacked_list);
+
+ spin_lock_init(&tp_vars->cwnd_lock);
+
+ tp_vars->prerandom_offset = 0;
+ spin_lock_init(&tp_vars->prerandom_lock);
+
+ kref_get(&tp_vars->refcount);
+ hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list);
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+
+ tp_vars->test_length = test_length;
+ if (!tp_vars->test_length)
+ tp_vars->test_length = BATADV_TP_DEF_TEST_LENGTH;
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: starting throughput meter towards %pM (length=%ums)\n",
+ dst, test_length);
+
+ /* init work item for finished tp tests */
+ INIT_DELAYED_WORK(&tp_vars->finish_work, batadv_tp_sender_finish);
+
+ /* start tp kthread. This way the write() call issued from userspace can
+ * happily return and avoid to block
+ */
+ batadv_tp_start_kthread(tp_vars);
+
+ /* don't return reference to new tp_vars */
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_stop - stop currently running tp meter session
+ * @bat_priv: the bat priv with all the soft interface information
+ * @dst: the receiver MAC address
+ * @return_value: reason for tp meter session stop
+ */
+void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst,
+ u8 return_value)
+{
+ struct batadv_orig_node *orig_node;
+ struct batadv_tp_vars *tp_vars;
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: stopping test towards %pM\n", dst);
+
+ orig_node = batadv_orig_hash_find(bat_priv, dst);
+ if (!orig_node)
+ return;
+
+ tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig);
+ if (!tp_vars) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: trying to interrupt an already over connection\n");
+ goto out;
+ }
+
+ batadv_tp_sender_shutdown(tp_vars, return_value);
+ batadv_tp_vars_put(tp_vars);
+out:
+ batadv_orig_node_put(orig_node);
+}
+
+/**
+ * batadv_tp_reset_receiver_timer - reset the receiver shutdown timer
+ * @tp_vars: the private data of the current TP meter session
+ *
+ * start the receiver shutdown timer or reset it if already started
+ */
+static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars)
+{
+ mod_timer(&tp_vars->timer,
+ jiffies + msecs_to_jiffies(BATADV_TP_RECV_TIMEOUT));
+}
+
+/**
+ * batadv_tp_receiver_shutdown - stop a tp meter receiver when timeout is
+ * reached without received ack
+ * @arg: address of the related tp_vars
+ */
+static void batadv_tp_receiver_shutdown(unsigned long arg)
+{
+ struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg;
+ struct batadv_tp_unacked *un, *safe;
+ struct batadv_priv *bat_priv;
+
+ bat_priv = tp_vars->bat_priv;
+
+ /* if there is recent activity rearm the timer */
+ if (!batadv_has_timed_out(tp_vars->last_recv_time,
+ BATADV_TP_RECV_TIMEOUT)) {
+ /* reset the receiver shutdown timer */
+ batadv_tp_reset_receiver_timer(tp_vars);
+ return;
+ }
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Shutting down for inactivity (more than %dms) from %pM\n",
+ BATADV_TP_RECV_TIMEOUT, tp_vars->other_end);
+
+ spin_lock_bh(&tp_vars->bat_priv->tp_list_lock);
+ hlist_del_rcu(&tp_vars->list);
+ spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock);
+
+ /* drop list reference */
+ batadv_tp_vars_put(tp_vars);
+
+ atomic_dec(&bat_priv->tp_num);
+
+ spin_lock_bh(&tp_vars->unacked_lock);
+ list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
+ list_del(&un->list);
+ kfree(un);
+ }
+ spin_unlock_bh(&tp_vars->unacked_lock);
+
+ /* drop reference of timer */
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_send_ack - send an ACK packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @dst: the mac address of the destination originator
+ * @seq: the sequence number to ACK
+ * @timestamp: the timestamp to echo back in the ACK
+ * @session: session identifier
+ * @socket_index: local ICMP socket identifier
+ *
+ * Return: 0 on success, a positive integer representing the reason of the
+ * failure otherwise
+ */
+static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
+ u32 seq, __be32 timestamp, const u8 *session,
+ int socket_index)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node;
+ struct batadv_icmp_tp_packet *icmp;
+ struct sk_buff *skb;
+ int r, ret;
+
+ orig_node = batadv_orig_hash_find(bat_priv, dst);
+ if (unlikely(!orig_node)) {
+ ret = BATADV_TP_REASON_DST_UNREACHABLE;
+ goto out;
+ }
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (unlikely(!primary_if)) {
+ ret = BATADV_TP_REASON_DST_UNREACHABLE;
+ goto out;
+ }
+
+ skb = netdev_alloc_skb_ip_align(NULL, sizeof(*icmp) + ETH_HLEN);
+ if (unlikely(!skb)) {
+ ret = BATADV_TP_REASON_MEMORY_ERROR;
+ goto out;
+ }
+
+ skb_reserve(skb, ETH_HLEN);
+ icmp = (struct batadv_icmp_tp_packet *)skb_put(skb, sizeof(*icmp));
+ icmp->packet_type = BATADV_ICMP;
+ icmp->version = BATADV_COMPAT_VERSION;
+ icmp->ttl = BATADV_TTL;
+ icmp->msg_type = BATADV_TP;
+ ether_addr_copy(icmp->dst, orig_node->orig);
+ ether_addr_copy(icmp->orig, primary_if->net_dev->dev_addr);
+ icmp->uid = socket_index;
+
+ icmp->subtype = BATADV_TP_ACK;
+ memcpy(icmp->session, session, sizeof(icmp->session));
+ icmp->seqno = htonl(seq);
+ icmp->timestamp = timestamp;
+
+ /* send the ack */
+ r = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (r == -1)
+ kfree_skb(skb);
+
+ if (unlikely(r < 0) || (r == NET_XMIT_DROP)) {
+ ret = BATADV_TP_REASON_DST_UNREACHABLE;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ if (likely(orig_node))
+ batadv_orig_node_put(orig_node);
+ if (likely(primary_if))
+ batadv_hardif_put(primary_if);
+
+ return ret;
+}
+
+/**
+ * batadv_tp_handle_out_of_order - store an out of order packet
+ * @tp_vars: the private data of the current TP meter session
+ * @skb: the buffer containing the received packet
+ *
+ * Store the out of order packet in the unacked list for late processing. This
+ * packets are kept in this list so that they can be ACKed at once as soon as
+ * all the previous packets have been received
+ *
+ * Return: true if the packed has been successfully processed, false otherwise
+ */
+static bool batadv_tp_handle_out_of_order(struct batadv_tp_vars *tp_vars,
+ const struct sk_buff *skb)
+{
+ const struct batadv_icmp_tp_packet *icmp;
+ struct batadv_tp_unacked *un, *new;
+ u32 payload_len;
+ bool added = false;
+
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (unlikely(!new))
+ return false;
+
+ icmp = (struct batadv_icmp_tp_packet *)skb->data;
+
+ new->seqno = ntohl(icmp->seqno);
+ payload_len = skb->len - sizeof(struct batadv_unicast_packet);
+ new->len = payload_len;
+
+ spin_lock_bh(&tp_vars->unacked_lock);
+ /* if the list is empty immediately attach this new object */
+ if (list_empty(&tp_vars->unacked_list)) {
+ list_add(&new->list, &tp_vars->unacked_list);
+ goto out;
+ }
+
+ /* otherwise loop over the list and either drop the packet because this
+ * is a duplicate or store it at the right position.
+ *
+ * The iteration is done in the reverse way because it is likely that
+ * the last received packet (the one being processed now) has a bigger
+ * seqno than all the others already stored.
+ */
+ list_for_each_entry_reverse(un, &tp_vars->unacked_list, list) {
+ /* check for duplicates */
+ if (new->seqno == un->seqno) {
+ if (new->len > un->len)
+ un->len = new->len;
+ kfree(new);
+ added = true;
+ break;
+ }
+
+ /* look for the right position */
+ if (batadv_seq_before(new->seqno, un->seqno))
+ continue;
+
+ /* as soon as an entry having a bigger seqno is found, the new
+ * one is attached _after_ it. In this way the list is kept in
+ * ascending order
+ */
+ list_add_tail(&new->list, &un->list);
+ added = true;
+ break;
+ }
+
+ /* received packet with smallest seqno out of order; add it to front */
+ if (!added)
+ list_add(&new->list, &tp_vars->unacked_list);
+
+out:
+ spin_unlock_bh(&tp_vars->unacked_lock);
+
+ return true;
+}
+
+/**
+ * batadv_tp_ack_unordered - update number received bytes in current stream
+ * without gaps
+ * @tp_vars: the private data of the current TP meter session
+ */
+static void batadv_tp_ack_unordered(struct batadv_tp_vars *tp_vars)
+{
+ struct batadv_tp_unacked *un, *safe;
+ u32 to_ack;
+
+ /* go through the unacked packet list and possibly ACK them as
+ * well
+ */
+ spin_lock_bh(&tp_vars->unacked_lock);
+ list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
+ /* the list is ordered, therefore it is possible to stop as soon
+ * there is a gap between the last acked seqno and the seqno of
+ * the packet under inspection
+ */
+ if (batadv_seq_before(tp_vars->last_recv, un->seqno))
+ break;
+
+ to_ack = un->seqno + un->len - tp_vars->last_recv;
+
+ if (batadv_seq_before(tp_vars->last_recv, un->seqno + un->len))
+ tp_vars->last_recv += to_ack;
+
+ list_del(&un->list);
+ kfree(un);
+ }
+ spin_unlock_bh(&tp_vars->unacked_lock);
+}
+
+/**
+ * batadv_tp_init_recv - return matching or create new receiver tp_vars
+ * @bat_priv: the bat priv with all the soft interface information
+ * @icmp: received icmp tp msg
+ *
+ * Return: corresponding tp_vars or NULL on errors
+ */
+static struct batadv_tp_vars *
+batadv_tp_init_recv(struct batadv_priv *bat_priv,
+ const struct batadv_icmp_tp_packet *icmp)
+{
+ struct batadv_tp_vars *tp_vars;
+
+ spin_lock_bh(&bat_priv->tp_list_lock);
+ tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
+ icmp->session);
+ if (tp_vars)
+ goto out_unlock;
+
+ if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: too many ongoing sessions, aborting (RECV)\n");
+ goto out_unlock;
+ }
+
+ tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC);
+ if (!tp_vars)
+ goto out_unlock;
+
+ ether_addr_copy(tp_vars->other_end, icmp->orig);
+ tp_vars->role = BATADV_TP_RECEIVER;
+ memcpy(tp_vars->session, icmp->session, sizeof(tp_vars->session));
+ tp_vars->last_recv = BATADV_TP_FIRST_SEQ;
+ tp_vars->bat_priv = bat_priv;
+ kref_init(&tp_vars->refcount);
+
+ spin_lock_init(&tp_vars->unacked_lock);
+ INIT_LIST_HEAD(&tp_vars->unacked_list);
+
+ kref_get(&tp_vars->refcount);
+ hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list);
+
+ kref_get(&tp_vars->refcount);
+ setup_timer(&tp_vars->timer, batadv_tp_receiver_shutdown,
+ (unsigned long)tp_vars);
+
+ batadv_tp_reset_receiver_timer(tp_vars);
+
+out_unlock:
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+
+ return tp_vars;
+}
+
+/**
+ * batadv_tp_recv_msg - process a single data message
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the buffer containing the received packet
+ *
+ * Process a received TP MSG packet
+ */
+static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
+ const struct sk_buff *skb)
+{
+ const struct batadv_icmp_tp_packet *icmp;
+ struct batadv_tp_vars *tp_vars;
+ size_t packet_size;
+ u32 seqno;
+
+ icmp = (struct batadv_icmp_tp_packet *)skb->data;
+
+ seqno = ntohl(icmp->seqno);
+ /* check if this is the first seqno. This means that if the
+ * first packet is lost, the tp meter does not work anymore!
+ */
+ if (seqno == BATADV_TP_FIRST_SEQ) {
+ tp_vars = batadv_tp_init_recv(bat_priv, icmp);
+ if (!tp_vars) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: seqno != BATADV_TP_FIRST_SEQ cannot initiate connection\n");
+ goto out;
+ }
+ } else {
+ tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
+ icmp->session);
+ if (!tp_vars) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Unexpected packet from %pM!\n",
+ icmp->orig);
+ goto out;
+ }
+ }
+
+ if (unlikely(tp_vars->role != BATADV_TP_RECEIVER)) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: dropping packet: not expected (role=%u)\n",
+ tp_vars->role);
+ goto out;
+ }
+
+ tp_vars->last_recv_time = jiffies;
+
+ /* if the packet is a duplicate, it may be the case that an ACK has been
+ * lost. Resend the ACK
+ */
+ if (batadv_seq_before(seqno, tp_vars->last_recv))
+ goto send_ack;
+
+ /* if the packet is out of order enqueue it */
+ if (ntohl(icmp->seqno) != tp_vars->last_recv) {
+ /* exit immediately (and do not send any ACK) if the packet has
+ * not been enqueued correctly
+ */
+ if (!batadv_tp_handle_out_of_order(tp_vars, skb))
+ goto out;
+
+ /* send a duplicate ACK */
+ goto send_ack;
+ }
+
+ /* if everything was fine count the ACKed bytes */
+ packet_size = skb->len - sizeof(struct batadv_unicast_packet);
+ tp_vars->last_recv += packet_size;
+
+ /* check if this ordered message filled a gap.... */
+ batadv_tp_ack_unordered(tp_vars);
+
+send_ack:
+ /* send the ACK. If the received packet was out of order, the ACK that
+ * is going to be sent is a duplicate (the sender will count them and
+ * possibly enter Fast Retransmit as soon as it has reached 3)
+ */
+ batadv_tp_send_ack(bat_priv, icmp->orig, tp_vars->last_recv,
+ icmp->timestamp, icmp->session, icmp->uid);
+out:
+ if (likely(tp_vars))
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_meter_recv - main TP Meter receiving function
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the buffer containing the received packet
+ */
+void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb)
+{
+ struct batadv_icmp_tp_packet *icmp;
+
+ icmp = (struct batadv_icmp_tp_packet *)skb->data;
+
+ switch (icmp->subtype) {
+ case BATADV_TP_MSG:
+ batadv_tp_recv_msg(bat_priv, skb);
+ break;
+ case BATADV_TP_ACK:
+ batadv_tp_recv_ack(bat_priv, skb);
+ break;
+ default:
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Received unknown TP Metric packet type %u\n",
+ icmp->subtype);
+ }
+ consume_skb(skb);
+}
+
+/**
+ * batadv_tp_meter_init - initialize global tp_meter structures
+ */
+void batadv_tp_meter_init(void)
+{
+ get_random_bytes(batadv_tp_prerandom, sizeof(batadv_tp_prerandom));
+}
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
new file mode 100644
index 000000000..ba922c425
--- /dev/null
+++ b/net/batman-adv/tp_meter.h
@@ -0,0 +1,34 @@
+/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+ *
+ * Edo Monticelli, Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_TP_METER_H_
+#define _NET_BATMAN_ADV_TP_METER_H_
+
+#include "main.h"
+
+#include <linux/types.h>
+
+struct sk_buff;
+
+void batadv_tp_meter_init(void);
+void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
+ u32 test_length, u32 *cookie);
+void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst,
+ u8 return_value);
+void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb);
+
+#endif /* _NET_BATMAN_ADV_TP_METER_H_ */
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 57ec87f37..7e6df7a49 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -47,10 +47,12 @@
#include "bridge_loop_avoidance.h"
#include "hard-interface.h"
#include "hash.h"
+#include "log.h"
#include "multicast.h"
#include "originator.h"
#include "packet.h"
#include "soft-interface.h"
+#include "tvlv.h"
/* hash class keys */
static struct lock_class_key batadv_tt_local_hash_lock_class_key;
@@ -996,7 +998,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
struct batadv_tt_local_entry *tt_local;
struct batadv_hard_iface *primary_if;
struct hlist_head *head;
- unsigned short vid;
u32 i;
int last_seen_secs;
int last_seen_msecs;
@@ -1023,7 +1024,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
tt_local = container_of(tt_common_entry,
struct batadv_tt_local_entry,
common);
- vid = tt_common_entry->vid;
last_seen_jiffies = jiffies - tt_local->last_seen;
last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
last_seen_secs = last_seen_msecs / 1000;
@@ -1547,7 +1547,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv,
struct batadv_tt_global_entry *tt_global_entry)
{
struct batadv_neigh_node *router, *best_router = NULL;
- struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+ struct batadv_algo_ops *bao = bat_priv->algo_ops;
struct hlist_head *head;
struct batadv_tt_orig_list_entry *orig_entry, *best_entry = NULL;
@@ -1559,8 +1559,8 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv,
continue;
if (best_router &&
- bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT,
- best_router, BATADV_IF_DEFAULT) <= 0) {
+ bao->neigh.cmp(router, BATADV_IF_DEFAULT, best_router,
+ BATADV_IF_DEFAULT) <= 0) {
batadv_neigh_node_put(router);
continue;
}
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
new file mode 100644
index 000000000..3d1cf0fb1
--- /dev/null
+++ b/net/batman-adv/tvlv.c
@@ -0,0 +1,632 @@
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+
+#include <linux/byteorder/generic.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "originator.h"
+#include "packet.h"
+#include "send.h"
+#include "tvlv.h"
+
+/**
+ * batadv_tvlv_handler_release - release tvlv handler from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the tvlv
+ */
+static void batadv_tvlv_handler_release(struct kref *ref)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+
+ tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount);
+ kfree_rcu(tvlv_handler, rcu);
+}
+
+/**
+ * batadv_tvlv_handler_put - decrement the tvlv container refcounter and
+ * possibly release it
+ * @tvlv_handler: the tvlv handler to free
+ */
+static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler)
+{
+ kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release);
+}
+
+/**
+ * batadv_tvlv_handler_get - retrieve tvlv handler from the tvlv handler list
+ * based on the provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv handler type to look for
+ * @version: tvlv handler version to look for
+ *
+ * Return: tvlv handler if found or NULL otherwise.
+ */
+static struct batadv_tvlv_handler *
+batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version)
+{
+ struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tvlv_handler_tmp,
+ &bat_priv->tvlv.handler_list, list) {
+ if (tvlv_handler_tmp->type != type)
+ continue;
+
+ if (tvlv_handler_tmp->version != version)
+ continue;
+
+ if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount))
+ continue;
+
+ tvlv_handler = tvlv_handler_tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return tvlv_handler;
+}
+
+/**
+ * batadv_tvlv_container_release - release tvlv from lists and free
+ * @ref: kref pointer of the tvlv
+ */
+static void batadv_tvlv_container_release(struct kref *ref)
+{
+ struct batadv_tvlv_container *tvlv;
+
+ tvlv = container_of(ref, struct batadv_tvlv_container, refcount);
+ kfree(tvlv);
+}
+
+/**
+ * batadv_tvlv_container_put - decrement the tvlv container refcounter and
+ * possibly release it
+ * @tvlv: the tvlv container to free
+ */
+static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv)
+{
+ kref_put(&tvlv->refcount, batadv_tvlv_container_release);
+}
+
+/**
+ * batadv_tvlv_container_get - retrieve tvlv container from the tvlv container
+ * list based on the provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv container type to look for
+ * @version: tvlv container version to look for
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (tvlv.container_list_lock).
+ *
+ * Return: tvlv container if found or NULL otherwise.
+ */
+static struct batadv_tvlv_container *
+batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
+{
+ struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL;
+
+ lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
+ hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) {
+ if (tvlv_tmp->tvlv_hdr.type != type)
+ continue;
+
+ if (tvlv_tmp->tvlv_hdr.version != version)
+ continue;
+
+ kref_get(&tvlv_tmp->refcount);
+ tvlv = tvlv_tmp;
+ break;
+ }
+
+ return tvlv;
+}
+
+/**
+ * batadv_tvlv_container_list_size - calculate the size of the tvlv container
+ * list entries
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (tvlv.container_list_lock).
+ *
+ * Return: size of all currently registered tvlv containers in bytes.
+ */
+static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
+{
+ struct batadv_tvlv_container *tvlv;
+ u16 tvlv_len = 0;
+
+ lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
+ hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
+ tvlv_len += sizeof(struct batadv_tvlv_hdr);
+ tvlv_len += ntohs(tvlv->tvlv_hdr.len);
+ }
+
+ return tvlv_len;
+}
+
+/**
+ * batadv_tvlv_container_remove - remove tvlv container from the tvlv container
+ * list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tvlv: the to be removed tvlv container
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (tvlv.container_list_lock).
+ */
+static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_container *tvlv)
+{
+ lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
+ if (!tvlv)
+ return;
+
+ hlist_del(&tvlv->list);
+
+ /* first call to decrement the counter, second call to free */
+ batadv_tvlv_container_put(tvlv);
+ batadv_tvlv_container_put(tvlv);
+}
+
+/**
+ * batadv_tvlv_container_unregister - unregister tvlv container based on the
+ * provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv container type to unregister
+ * @version: tvlv container type to unregister
+ */
+void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
+ u8 type, u8 version)
+{
+ struct batadv_tvlv_container *tvlv;
+
+ spin_lock_bh(&bat_priv->tvlv.container_list_lock);
+ tvlv = batadv_tvlv_container_get(bat_priv, type, version);
+ batadv_tvlv_container_remove(bat_priv, tvlv);
+ spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+}
+
+/**
+ * batadv_tvlv_container_register - register tvlv type, version and content
+ * to be propagated with each (primary interface) OGM
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv container type
+ * @version: tvlv container version
+ * @tvlv_value: tvlv container content
+ * @tvlv_value_len: tvlv container content length
+ *
+ * If a container of the same type and version was already registered the new
+ * content is going to replace the old one.
+ */
+void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
+ u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len)
+{
+ struct batadv_tvlv_container *tvlv_old, *tvlv_new;
+
+ if (!tvlv_value)
+ tvlv_value_len = 0;
+
+ tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC);
+ if (!tvlv_new)
+ return;
+
+ tvlv_new->tvlv_hdr.version = version;
+ tvlv_new->tvlv_hdr.type = type;
+ tvlv_new->tvlv_hdr.len = htons(tvlv_value_len);
+
+ memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len));
+ INIT_HLIST_NODE(&tvlv_new->list);
+ kref_init(&tvlv_new->refcount);
+
+ spin_lock_bh(&bat_priv->tvlv.container_list_lock);
+ tvlv_old = batadv_tvlv_container_get(bat_priv, type, version);
+ batadv_tvlv_container_remove(bat_priv, tvlv_old);
+ hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list);
+ spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+}
+
+/**
+ * batadv_tvlv_realloc_packet_buff - reallocate packet buffer to accommodate
+ * requested packet size
+ * @packet_buff: packet buffer
+ * @packet_buff_len: packet buffer size
+ * @min_packet_len: requested packet minimum size
+ * @additional_packet_len: requested additional packet size on top of minimum
+ * size
+ *
+ * Return: true of the packet buffer could be changed to the requested size,
+ * false otherwise.
+ */
+static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
+ int *packet_buff_len,
+ int min_packet_len,
+ int additional_packet_len)
+{
+ unsigned char *new_buff;
+
+ new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC);
+
+ /* keep old buffer if kmalloc should fail */
+ if (!new_buff)
+ return false;
+
+ memcpy(new_buff, *packet_buff, min_packet_len);
+ kfree(*packet_buff);
+ *packet_buff = new_buff;
+ *packet_buff_len = min_packet_len + additional_packet_len;
+
+ return true;
+}
+
+/**
+ * batadv_tvlv_container_ogm_append - append tvlv container content to given
+ * OGM packet buffer
+ * @bat_priv: the bat priv with all the soft interface information
+ * @packet_buff: ogm packet buffer
+ * @packet_buff_len: ogm packet buffer size including ogm header and tvlv
+ * content
+ * @packet_min_len: ogm header size to be preserved for the OGM itself
+ *
+ * The ogm packet might be enlarged or shrunk depending on the current size
+ * and the size of the to-be-appended tvlv containers.
+ *
+ * Return: size of all appended tvlv containers in bytes.
+ */
+u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+ unsigned char **packet_buff,
+ int *packet_buff_len, int packet_min_len)
+{
+ struct batadv_tvlv_container *tvlv;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ u16 tvlv_value_len;
+ void *tvlv_value;
+ bool ret;
+
+ spin_lock_bh(&bat_priv->tvlv.container_list_lock);
+ tvlv_value_len = batadv_tvlv_container_list_size(bat_priv);
+
+ ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len,
+ packet_min_len, tvlv_value_len);
+
+ if (!ret)
+ goto end;
+
+ if (!tvlv_value_len)
+ goto end;
+
+ tvlv_value = (*packet_buff) + packet_min_len;
+
+ hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
+ tvlv_hdr = tvlv_value;
+ tvlv_hdr->type = tvlv->tvlv_hdr.type;
+ tvlv_hdr->version = tvlv->tvlv_hdr.version;
+ tvlv_hdr->len = tvlv->tvlv_hdr.len;
+ tvlv_value = tvlv_hdr + 1;
+ memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len));
+ tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len);
+ }
+
+end:
+ spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+ return tvlv_value_len;
+}
+
+/**
+ * batadv_tvlv_call_handler - parse the given tvlv buffer to call the
+ * appropriate handlers
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tvlv_handler: tvlv callback function handling the tvlv content
+ * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet
+ * @orig_node: orig node emitting the ogm packet
+ * @src: source mac address of the unicast packet
+ * @dst: destination mac address of the unicast packet
+ * @tvlv_value: tvlv content
+ * @tvlv_value_len: tvlv content length
+ *
+ * Return: success if handler was not found or the return value of the handler
+ * callback.
+ */
+static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_handler *tvlv_handler,
+ bool ogm_source,
+ struct batadv_orig_node *orig_node,
+ u8 *src, u8 *dst,
+ void *tvlv_value, u16 tvlv_value_len)
+{
+ if (!tvlv_handler)
+ return NET_RX_SUCCESS;
+
+ if (ogm_source) {
+ if (!tvlv_handler->ogm_handler)
+ return NET_RX_SUCCESS;
+
+ if (!orig_node)
+ return NET_RX_SUCCESS;
+
+ tvlv_handler->ogm_handler(bat_priv, orig_node,
+ BATADV_NO_FLAGS,
+ tvlv_value, tvlv_value_len);
+ tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED;
+ } else {
+ if (!src)
+ return NET_RX_SUCCESS;
+
+ if (!dst)
+ return NET_RX_SUCCESS;
+
+ if (!tvlv_handler->unicast_handler)
+ return NET_RX_SUCCESS;
+
+ return tvlv_handler->unicast_handler(bat_priv, src,
+ dst, tvlv_value,
+ tvlv_value_len);
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_tvlv_containers_process - parse the given tvlv buffer to call the
+ * appropriate handlers
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet
+ * @orig_node: orig node emitting the ogm packet
+ * @src: source mac address of the unicast packet
+ * @dst: destination mac address of the unicast packet
+ * @tvlv_value: tvlv content
+ * @tvlv_value_len: tvlv content length
+ *
+ * Return: success when processing an OGM or the return value of all called
+ * handler callbacks.
+ */
+int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
+ bool ogm_source,
+ struct batadv_orig_node *orig_node,
+ u8 *src, u8 *dst,
+ void *tvlv_value, u16 tvlv_value_len)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ u16 tvlv_value_cont_len;
+ u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND;
+ int ret = NET_RX_SUCCESS;
+
+ while (tvlv_value_len >= sizeof(*tvlv_hdr)) {
+ tvlv_hdr = tvlv_value;
+ tvlv_value_cont_len = ntohs(tvlv_hdr->len);
+ tvlv_value = tvlv_hdr + 1;
+ tvlv_value_len -= sizeof(*tvlv_hdr);
+
+ if (tvlv_value_cont_len > tvlv_value_len)
+ break;
+
+ tvlv_handler = batadv_tvlv_handler_get(bat_priv,
+ tvlv_hdr->type,
+ tvlv_hdr->version);
+
+ ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler,
+ ogm_source, orig_node,
+ src, dst, tvlv_value,
+ tvlv_value_cont_len);
+ if (tvlv_handler)
+ batadv_tvlv_handler_put(tvlv_handler);
+ tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len;
+ tvlv_value_len -= tvlv_value_cont_len;
+ }
+
+ if (!ogm_source)
+ return ret;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tvlv_handler,
+ &bat_priv->tvlv.handler_list, list) {
+ if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) &&
+ !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED))
+ tvlv_handler->ogm_handler(bat_priv, orig_node,
+ cifnotfound, NULL, 0);
+
+ tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED;
+ }
+ rcu_read_unlock();
+
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_tvlv_ogm_receive - process an incoming ogm and call the appropriate
+ * handlers
+ * @bat_priv: the bat priv with all the soft interface information
+ * @batadv_ogm_packet: ogm packet containing the tvlv containers
+ * @orig_node: orig node emitting the ogm packet
+ */
+void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_orig_node *orig_node)
+{
+ void *tvlv_value;
+ u16 tvlv_value_len;
+
+ if (!batadv_ogm_packet)
+ return;
+
+ tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len);
+ if (!tvlv_value_len)
+ return;
+
+ tvlv_value = batadv_ogm_packet + 1;
+
+ batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL,
+ tvlv_value, tvlv_value_len);
+}
+
+/**
+ * batadv_tvlv_handler_register - register tvlv handler based on the provided
+ * type and version (both need to match) for ogm tvlv payload and/or unicast
+ * payload
+ * @bat_priv: the bat priv with all the soft interface information
+ * @optr: ogm tvlv handler callback function. This function receives the orig
+ * node, flags and the tvlv content as argument to process.
+ * @uptr: unicast tvlv handler callback function. This function receives the
+ * source & destination of the unicast packet as well as the tvlv content
+ * to process.
+ * @type: tvlv handler type to be registered
+ * @version: tvlv handler version to be registered
+ * @flags: flags to enable or disable TVLV API behavior
+ */
+void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
+ void (*optr)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags,
+ void *tvlv_value,
+ u16 tvlv_value_len),
+ int (*uptr)(struct batadv_priv *bat_priv,
+ u8 *src, u8 *dst,
+ void *tvlv_value,
+ u16 tvlv_value_len),
+ u8 type, u8 version, u8 flags)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+
+ tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
+ if (tvlv_handler) {
+ batadv_tvlv_handler_put(tvlv_handler);
+ return;
+ }
+
+ tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC);
+ if (!tvlv_handler)
+ return;
+
+ tvlv_handler->ogm_handler = optr;
+ tvlv_handler->unicast_handler = uptr;
+ tvlv_handler->type = type;
+ tvlv_handler->version = version;
+ tvlv_handler->flags = flags;
+ kref_init(&tvlv_handler->refcount);
+ INIT_HLIST_NODE(&tvlv_handler->list);
+
+ spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
+ hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list);
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
+}
+
+/**
+ * batadv_tvlv_handler_unregister - unregister tvlv handler based on the
+ * provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv handler type to be unregistered
+ * @version: tvlv handler version to be unregistered
+ */
+void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
+ u8 type, u8 version)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+
+ tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
+ if (!tvlv_handler)
+ return;
+
+ batadv_tvlv_handler_put(tvlv_handler);
+ spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
+ hlist_del_rcu(&tvlv_handler->list);
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
+ batadv_tvlv_handler_put(tvlv_handler);
+}
+
+/**
+ * batadv_tvlv_unicast_send - send a unicast packet with tvlv payload to the
+ * specified host
+ * @bat_priv: the bat priv with all the soft interface information
+ * @src: source mac address of the unicast packet
+ * @dst: destination mac address of the unicast packet
+ * @type: tvlv type
+ * @version: tvlv version
+ * @tvlv_value: tvlv content
+ * @tvlv_value_len: tvlv content length
+ */
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
+ u8 *dst, u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len)
+{
+ struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ struct batadv_orig_node *orig_node;
+ struct sk_buff *skb;
+ unsigned char *tvlv_buff;
+ unsigned int tvlv_len;
+ ssize_t hdr_len = sizeof(*unicast_tvlv_packet);
+ int res;
+
+ orig_node = batadv_orig_hash_find(bat_priv, dst);
+ if (!orig_node)
+ return;
+
+ tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len;
+
+ skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len);
+ if (!skb)
+ goto out;
+
+ skb->priority = TC_PRIO_CONTROL;
+ skb_reserve(skb, ETH_HLEN);
+ tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len);
+ unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff;
+ unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV;
+ unicast_tvlv_packet->version = BATADV_COMPAT_VERSION;
+ unicast_tvlv_packet->ttl = BATADV_TTL;
+ unicast_tvlv_packet->reserved = 0;
+ unicast_tvlv_packet->tvlv_len = htons(tvlv_len);
+ unicast_tvlv_packet->align = 0;
+ ether_addr_copy(unicast_tvlv_packet->src, src);
+ ether_addr_copy(unicast_tvlv_packet->dst, dst);
+
+ tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1);
+ tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff;
+ tvlv_hdr->version = version;
+ tvlv_hdr->type = type;
+ tvlv_hdr->len = htons(tvlv_value_len);
+ tvlv_buff += sizeof(*tvlv_hdr);
+ memcpy(tvlv_buff, tvlv_value, tvlv_value_len);
+
+ res = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (res == -1)
+ kfree_skb(skb);
+out:
+ batadv_orig_node_put(orig_node);
+}
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
new file mode 100644
index 000000000..e4369b547
--- /dev/null
+++ b/net/batman-adv/tvlv.h
@@ -0,0 +1,61 @@
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_TVLV_H_
+#define _NET_BATMAN_ADV_TVLV_H_
+
+#include "main.h"
+
+#include <linux/types.h>
+
+struct batadv_ogm_packet;
+
+void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
+ u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len);
+u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+ unsigned char **packet_buff,
+ int *packet_buff_len, int packet_min_len);
+void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_orig_node *orig_node);
+void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
+ u8 type, u8 version);
+
+void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
+ void (*optr)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags,
+ void *tvlv_value,
+ u16 tvlv_value_len),
+ int (*uptr)(struct batadv_priv *bat_priv,
+ u8 *src, u8 *dst,
+ void *tvlv_value,
+ u16 tvlv_value_len),
+ u8 type, u8 version, u8 flags);
+void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
+ u8 type, u8 version);
+int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
+ bool ogm_source,
+ struct batadv_orig_node *orig_node,
+ u8 *src, u8 *dst,
+ void *tvlv_buff, u16 tvlv_buff_len);
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
+ u8 *dst, u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len);
+
+#endif /* _NET_BATMAN_ADV_TVLV_H_ */
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 74d865a4d..a64522c3b 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -33,6 +33,7 @@
#include <linux/types.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <uapi/linux/batman_adv.h>
#include "packet.h"
@@ -709,6 +710,8 @@ struct batadv_priv_debug_log {
* @list: list of available gateway nodes
* @list_lock: lock protecting gw_list & curr_gw
* @curr_gw: pointer to currently selected gateway node
+ * @mode: gateway operation: off, client or server (see batadv_gw_modes)
+ * @sel_class: gateway selection class (applies if gw_mode client)
* @bandwidth_down: advertised uplink download bandwidth (if gw_mode server)
* @bandwidth_up: advertised uplink upload bandwidth (if gw_mode server)
* @reselect: bool indicating a gateway re-selection is in progress
@@ -717,6 +720,8 @@ struct batadv_priv_gw {
struct hlist_head list;
spinlock_t list_lock; /* protects gw_list & curr_gw */
struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */
+ atomic_t mode;
+ atomic_t sel_class;
atomic_t bandwidth_down;
atomic_t bandwidth_up;
atomic_t reselect;
@@ -753,14 +758,28 @@ struct batadv_priv_dat {
#ifdef CONFIG_BATMAN_ADV_MCAST
/**
+ * struct batadv_mcast_querier_state - IGMP/MLD querier state when bridged
+ * @exists: whether a querier exists in the mesh
+ * @shadowing: if a querier exists, whether it is potentially shadowing
+ * multicast listeners (i.e. querier is behind our own bridge segment)
+ */
+struct batadv_mcast_querier_state {
+ bool exists;
+ bool shadowing;
+};
+
+/**
* struct batadv_priv_mcast - per mesh interface mcast data
* @mla_list: list of multicast addresses we are currently announcing via TT
* @want_all_unsnoopables_list: a list of orig_nodes wanting all unsnoopable
* multicast traffic
* @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast traffic
* @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast traffic
+ * @querier_ipv4: the current state of an IGMP querier in the mesh
+ * @querier_ipv6: the current state of an MLD querier in the mesh
* @flags: the flags we have last sent in our mcast tvlv
* @enabled: whether the multicast tvlv is currently enabled
+ * @bridged: whether the soft interface has a bridge on top
* @num_disabled: number of nodes that have no mcast tvlv
* @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP traffic
* @num_want_all_ipv4: counter for items in want_all_ipv4_list
@@ -773,8 +792,11 @@ struct batadv_priv_mcast {
struct hlist_head want_all_unsnoopables_list;
struct hlist_head want_all_ipv4_list;
struct hlist_head want_all_ipv6_list;
+ struct batadv_mcast_querier_state querier_ipv4;
+ struct batadv_mcast_querier_state querier_ipv6;
u8 flags;
bool enabled;
+ bool bridged;
atomic_t num_disabled;
atomic_t num_want_all_unsnoopables;
atomic_t num_want_all_ipv4;
@@ -814,6 +836,111 @@ struct batadv_priv_nc {
};
/**
+ * struct batadv_tp_unacked - unacked packet meta-information
+ * @seqno: seqno of the unacked packet
+ * @len: length of the packet
+ * @list: list node for batadv_tp_vars::unacked_list
+ *
+ * This struct is supposed to represent a buffer unacked packet. However, since
+ * the purpose of the TP meter is to count the traffic only, there is no need to
+ * store the entire sk_buff, the starting offset and the length are enough
+ */
+struct batadv_tp_unacked {
+ u32 seqno;
+ u16 len;
+ struct list_head list;
+};
+
+/**
+ * enum batadv_tp_meter_role - Modus in tp meter session
+ * @BATADV_TP_RECEIVER: Initialized as receiver
+ * @BATADV_TP_SENDER: Initialized as sender
+ */
+enum batadv_tp_meter_role {
+ BATADV_TP_RECEIVER,
+ BATADV_TP_SENDER
+};
+
+/**
+ * struct batadv_tp_vars - tp meter private variables per session
+ * @list: list node for bat_priv::tp_list
+ * @timer: timer for ack (receiver) and retry (sender)
+ * @bat_priv: pointer to the mesh object
+ * @start_time: start time in jiffies
+ * @other_end: mac address of remote
+ * @role: receiver/sender modi
+ * @sending: sending binary semaphore: 1 if sending, 0 is not
+ * @reason: reason for a stopped session
+ * @finish_work: work item for the finishing procedure
+ * @test_length: test length in milliseconds
+ * @session: TP session identifier
+ * @icmp_uid: local ICMP "socket" index
+ * @dec_cwnd: decimal part of the cwnd used during linear growth
+ * @cwnd: current size of the congestion window
+ * @cwnd_lock: lock do protect @cwnd & @dec_cwnd
+ * @ss_threshold: Slow Start threshold. Once cwnd exceeds this value the
+ * connection switches to the Congestion Avoidance state
+ * @last_acked: last acked byte
+ * @last_sent: last sent byte, not yet acked
+ * @tot_sent: amount of data sent/ACKed so far
+ * @dup_acks: duplicate ACKs counter
+ * @fast_recovery: true if in Fast Recovery mode
+ * @recover: last sent seqno when entering Fast Recovery
+ * @rto: sender timeout
+ * @srtt: smoothed RTT scaled by 2^3
+ * @rttvar: RTT variation scaled by 2^2
+ * @more_bytes: waiting queue anchor when waiting for more ack/retry timeout
+ * @prerandom_offset: offset inside the prerandom buffer
+ * @prerandom_lock: spinlock protecting access to prerandom_offset
+ * @last_recv: last in-order received packet
+ * @unacked_list: list of unacked packets (meta-info only)
+ * @unacked_lock: protect unacked_list
+ * @last_recv_time: time time (jiffies) a msg was received
+ * @refcount: number of context where the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ */
+struct batadv_tp_vars {
+ struct hlist_node list;
+ struct timer_list timer;
+ struct batadv_priv *bat_priv;
+ unsigned long start_time;
+ u8 other_end[ETH_ALEN];
+ enum batadv_tp_meter_role role;
+ atomic_t sending;
+ enum batadv_tp_meter_reason reason;
+ struct delayed_work finish_work;
+ u32 test_length;
+ u8 session[2];
+ u8 icmp_uid;
+
+ /* sender variables */
+ u16 dec_cwnd;
+ u32 cwnd;
+ spinlock_t cwnd_lock; /* Protects cwnd & dec_cwnd */
+ u32 ss_threshold;
+ atomic_t last_acked;
+ u32 last_sent;
+ atomic64_t tot_sent;
+ atomic_t dup_acks;
+ bool fast_recovery;
+ u32 recover;
+ u32 rto;
+ u32 srtt;
+ u32 rttvar;
+ wait_queue_head_t more_bytes;
+ u32 prerandom_offset;
+ spinlock_t prerandom_lock; /* Protects prerandom_offset */
+
+ /* receiver variables */
+ u32 last_recv;
+ struct list_head unacked_list;
+ spinlock_t unacked_lock; /* Protects unacked_list */
+ unsigned long last_recv_time;
+ struct kref refcount;
+ struct rcu_head rcu;
+};
+
+/**
* struct batadv_softif_vlan - per VLAN attributes set
* @bat_priv: pointer to the mesh object
* @vid: VLAN identifier
@@ -867,8 +994,6 @@ struct batadv_priv_bat_v {
* enabled
* @multicast_mode: Enable or disable multicast optimizations on this node's
* sender/originating side
- * @gw_mode: gateway operation: off, client or server (see batadv_gw_modes)
- * @gw_sel_class: gateway selection class (applies if gw_mode client)
* @orig_interval: OGM broadcast interval in milliseconds
* @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop
* @log_level: configured log level (see batadv_dbg_level)
@@ -883,14 +1008,17 @@ struct batadv_priv_bat_v {
* @debug_dir: dentry for debugfs batman-adv subdirectory
* @forw_bat_list: list of aggregated OGMs that will be forwarded
* @forw_bcast_list: list of broadcast packets that will be rebroadcasted
+ * @tp_list: list of tp sessions
+ * @tp_num: number of currently active tp sessions
* @orig_hash: hash table containing mesh participants (orig nodes)
* @forw_bat_list_lock: lock protecting forw_bat_list
* @forw_bcast_list_lock: lock protecting forw_bcast_list
+ * @tp_list_lock: spinlock protecting @tp_list
* @orig_work: work queue callback item for orig node purging
* @cleanup_work: work queue callback item for soft-interface deinit
* @primary_if: one of the hard-interfaces assigned to this mesh interface
* becomes the primary interface
- * @bat_algo_ops: routing algorithm used by this mesh interface
+ * @algo_ops: routing algorithm used by this mesh interface
* @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top
* of the mesh interface represented by this object
* @softif_vlan_list_lock: lock protecting softif_vlan_list
@@ -924,8 +1052,6 @@ struct batadv_priv {
#ifdef CONFIG_BATMAN_ADV_MCAST
atomic_t multicast_mode;
#endif
- atomic_t gw_mode;
- atomic_t gw_sel_class;
atomic_t orig_interval;
atomic_t hop_penalty;
#ifdef CONFIG_BATMAN_ADV_DEBUG
@@ -941,13 +1067,16 @@ struct batadv_priv {
struct dentry *debug_dir;
struct hlist_head forw_bat_list;
struct hlist_head forw_bcast_list;
+ struct hlist_head tp_list;
struct batadv_hashtable *orig_hash;
spinlock_t forw_bat_list_lock; /* protects forw_bat_list */
spinlock_t forw_bcast_list_lock; /* protects forw_bcast_list */
+ spinlock_t tp_list_lock; /* protects tp_list */
+ atomic_t tp_num;
struct delayed_work orig_work;
struct work_struct cleanup_work;
struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */
- struct batadv_algo_ops *bat_algo_ops;
+ struct batadv_algo_ops *algo_ops;
struct hlist_head softif_vlan_list;
spinlock_t softif_vlan_list_lock; /* protects softif_vlan_list */
#ifdef CONFIG_BATMAN_ADV_BLA
@@ -1265,66 +1394,77 @@ struct batadv_forw_packet {
};
/**
+ * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific)
+ * @activate: start routing mechanisms when hard-interface is brought up
+ * @enable: init routing info when hard-interface is enabled
+ * @disable: de-init routing info when hard-interface is disabled
+ * @update_mac: (re-)init mac addresses of the protocol information
+ * belonging to this hard-interface
+ * @primary_set: called when primary interface is selected / changed
+ */
+struct batadv_algo_iface_ops {
+ void (*activate)(struct batadv_hard_iface *hard_iface);
+ int (*enable)(struct batadv_hard_iface *hard_iface);
+ void (*disable)(struct batadv_hard_iface *hard_iface);
+ void (*update_mac)(struct batadv_hard_iface *hard_iface);
+ void (*primary_set)(struct batadv_hard_iface *hard_iface);
+};
+
+/**
+ * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific)
+ * @hardif_init: called on creation of single hop entry
+ * @cmp: compare the metrics of two neighbors for their respective outgoing
+ * interfaces
+ * @is_similar_or_better: check if neigh1 is equally similar or better than
+ * neigh2 for their respective outgoing interface from the metric prospective
+ * @print: print the single hop neighbor list (optional)
+ */
+struct batadv_algo_neigh_ops {
+ void (*hardif_init)(struct batadv_hardif_neigh_node *neigh);
+ int (*cmp)(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2);
+ bool (*is_similar_or_better)(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2);
+ void (*print)(struct batadv_priv *priv, struct seq_file *seq);
+};
+
+/**
+ * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific)
+ * @free: free the resources allocated by the routing algorithm for an orig_node
+ * object
+ * @add_if: ask the routing algorithm to apply the needed changes to the
+ * orig_node due to a new hard-interface being added into the mesh
+ * @del_if: ask the routing algorithm to apply the needed changes to the
+ * orig_node due to an hard-interface being removed from the mesh
+ * @print: print the originator table (optional)
+ */
+struct batadv_algo_orig_ops {
+ void (*free)(struct batadv_orig_node *orig_node);
+ int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num);
+ int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num,
+ int del_if_num);
+ void (*print)(struct batadv_priv *priv, struct seq_file *seq,
+ struct batadv_hard_iface *hard_iface);
+};
+
+/**
* struct batadv_algo_ops - mesh algorithm callbacks
* @list: list node for the batadv_algo_list
* @name: name of the algorithm
- * @bat_iface_activate: start routing mechanisms when hard-interface is brought
- * up
- * @bat_iface_enable: init routing info when hard-interface is enabled
- * @bat_iface_disable: de-init routing info when hard-interface is disabled
- * @bat_iface_update_mac: (re-)init mac addresses of the protocol information
- * belonging to this hard-interface
- * @bat_primary_iface_set: called when primary interface is selected / changed
- * @bat_ogm_schedule: prepare a new outgoing OGM for the send queue
- * @bat_ogm_emit: send scheduled OGM
- * @bat_hardif_neigh_init: called on creation of single hop entry
- * @bat_neigh_cmp: compare the metrics of two neighbors for their respective
- * outgoing interfaces
- * @bat_neigh_is_similar_or_better: check if neigh1 is equally similar or
- * better than neigh2 for their respective outgoing interface from the metric
- * prospective
- * @bat_neigh_print: print the single hop neighbor list (optional)
- * @bat_neigh_free: free the resources allocated by the routing algorithm for a
- * neigh_node object
- * @bat_orig_print: print the originator table (optional)
- * @bat_orig_free: free the resources allocated by the routing algorithm for an
- * orig_node object
- * @bat_orig_add_if: ask the routing algorithm to apply the needed changes to
- * the orig_node due to a new hard-interface being added into the mesh
- * @bat_orig_del_if: ask the routing algorithm to apply the needed changes to
- * the orig_node due to an hard-interface being removed from the mesh
+ * @iface: callbacks related to interface handling
+ * @neigh: callbacks related to neighbors handling
+ * @orig: callbacks related to originators handling
*/
struct batadv_algo_ops {
struct hlist_node list;
char *name;
- void (*bat_iface_activate)(struct batadv_hard_iface *hard_iface);
- int (*bat_iface_enable)(struct batadv_hard_iface *hard_iface);
- void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface);
- void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface);
- void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface);
- void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface);
- void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet);
- /* neigh_node handling API */
- void (*bat_hardif_neigh_init)(struct batadv_hardif_neigh_node *neigh);
- int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1,
- struct batadv_hard_iface *if_outgoing1,
- struct batadv_neigh_node *neigh2,
- struct batadv_hard_iface *if_outgoing2);
- bool (*bat_neigh_is_similar_or_better)
- (struct batadv_neigh_node *neigh1,
- struct batadv_hard_iface *if_outgoing1,
- struct batadv_neigh_node *neigh2,
- struct batadv_hard_iface *if_outgoing2);
- void (*bat_neigh_print)(struct batadv_priv *priv, struct seq_file *seq);
- void (*bat_neigh_free)(struct batadv_neigh_node *neigh);
- /* orig_node handling API */
- void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq,
- struct batadv_hard_iface *hard_iface);
- void (*bat_orig_free)(struct batadv_orig_node *orig_node);
- int (*bat_orig_add_if)(struct batadv_orig_node *orig_node,
- int max_if_num);
- int (*bat_orig_del_if)(struct batadv_orig_node *orig_node,
- int max_if_num, int del_if_num);
+ struct batadv_algo_iface_ops iface;
+ struct batadv_algo_neigh_ops neigh;
+ struct batadv_algo_orig_ops orig;
};
/**
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 780089d75..d020299ba 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -627,20 +627,9 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev)
return err < 0 ? NET_XMIT_DROP : err;
}
-static struct lock_class_key bt_tx_busylock;
-static struct lock_class_key bt_netdev_xmit_lock_key;
-
-static void bt_set_lockdep_class_one(struct net_device *dev,
- struct netdev_queue *txq,
- void *_unused)
-{
- lockdep_set_class(&txq->_xmit_lock, &bt_netdev_xmit_lock_key);
-}
-
static int bt_dev_init(struct net_device *dev)
{
- netdev_for_each_tx_queue(dev, bt_set_lockdep_class_one, NULL);
- dev->qdisc_tx_busylock = &bt_tx_busylock;
+ netdev_lockdep_set_classes(dev);
return 0;
}
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 3df7aefb7..0b5f729d0 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -215,6 +215,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
struct sock *sk = sock->sk;
struct sk_buff *skb;
size_t copied;
+ size_t skblen;
int err;
BT_DBG("sock %p sk %p len %zu", sock, sk, len);
@@ -230,6 +231,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
return err;
}
+ skblen = skb->len;
copied = skb->len;
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
@@ -248,6 +250,9 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
skb_free_datagram(sk, skb);
+ if (flags & MSG_TRUNC)
+ copied = skblen;
+
return err ? : copied;
}
EXPORT_SYMBOL(bt_sock_recvmsg);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index bf9f8a801..3809617aa 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -625,7 +625,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
list_for_each_entry(d, &hci_dev_list, list) {
if (!test_bit(HCI_UP, &d->flags) ||
hci_dev_test_flag(d, HCI_USER_CHANNEL) ||
- d->dev_type != HCI_BREDR)
+ d->dev_type != HCI_PRIMARY)
continue;
/* Simple routing:
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 45a9fc68c..ddf8432fe 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -260,14 +260,12 @@ static int hci_init1_req(struct hci_request *req, unsigned long opt)
hci_reset_req(req, 0);
switch (hdev->dev_type) {
- case HCI_BREDR:
+ case HCI_PRIMARY:
bredr_init(req);
break;
-
case HCI_AMP:
amp_init1(req);
break;
-
default:
BT_ERR("Unknown device type %d", hdev->dev_type);
break;
@@ -791,11 +789,11 @@ static int __hci_init(struct hci_dev *hdev)
if (err < 0)
return err;
- /* HCI_BREDR covers both single-mode LE, BR/EDR and dual-mode
+ /* HCI_PRIMARY covers both single-mode LE, BR/EDR and dual-mode
* BR/EDR/LE type controllers. AMP controllers only need the
* first two stages of init.
*/
- if (hdev->dev_type != HCI_BREDR)
+ if (hdev->dev_type != HCI_PRIMARY)
return 0;
err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT, NULL);
@@ -1202,7 +1200,7 @@ int hci_inquiry(void __user *arg)
goto done;
}
- if (hdev->dev_type != HCI_BREDR) {
+ if (hdev->dev_type != HCI_PRIMARY) {
err = -EOPNOTSUPP;
goto done;
}
@@ -1307,7 +1305,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
* since AMP controllers do not have an address.
*/
if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
- hdev->dev_type == HCI_BREDR &&
+ hdev->dev_type == HCI_PRIMARY &&
!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
!bacmp(&hdev->static_addr, BDADDR_ANY)) {
ret = -EADDRNOTAVAIL;
@@ -1402,7 +1400,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
hci_dev_test_flag(hdev, HCI_MGMT) &&
- hdev->dev_type == HCI_BREDR) {
+ hdev->dev_type == HCI_PRIMARY) {
ret = __hci_req_hci_power_on(hdev);
mgmt_power_on(hdev, ret);
}
@@ -1563,7 +1561,7 @@ int hci_dev_do_close(struct hci_dev *hdev)
auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
- if (!auto_off && hdev->dev_type == HCI_BREDR &&
+ if (!auto_off && hdev->dev_type == HCI_PRIMARY &&
hci_dev_test_flag(hdev, HCI_MGMT))
__mgmt_power_off(hdev);
@@ -1802,7 +1800,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
goto done;
}
- if (hdev->dev_type != HCI_BREDR) {
+ if (hdev->dev_type != HCI_PRIMARY) {
err = -EOPNOTSUPP;
goto done;
}
@@ -2043,7 +2041,7 @@ static void hci_power_on(struct work_struct *work)
*/
if (hci_dev_test_flag(hdev, HCI_RFKILLED) ||
hci_dev_test_flag(hdev, HCI_UNCONFIGURED) ||
- (hdev->dev_type == HCI_BREDR &&
+ (hdev->dev_type == HCI_PRIMARY &&
!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
!bacmp(&hdev->static_addr, BDADDR_ANY))) {
hci_dev_clear_flag(hdev, HCI_AUTO_OFF);
@@ -3030,7 +3028,7 @@ int hci_register_dev(struct hci_dev *hdev)
* so the index can be used as the AMP controller ID.
*/
switch (hdev->dev_type) {
- case HCI_BREDR:
+ case HCI_PRIMARY:
id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL);
break;
case HCI_AMP:
@@ -3090,7 +3088,7 @@ int hci_register_dev(struct hci_dev *hdev)
hci_dev_set_flag(hdev, HCI_SETUP);
hci_dev_set_flag(hdev, HCI_AUTO_OFF);
- if (hdev->dev_type == HCI_BREDR) {
+ if (hdev->dev_type == HCI_PRIMARY) {
/* Assume BR/EDR support until proven otherwise (such as
* through reading supported features during init.
*/
@@ -3165,6 +3163,8 @@ void hci_unregister_dev(struct hci_dev *hdev)
device_del(&hdev->dev);
debugfs_remove_recursive(hdev->debugfs);
+ kfree_const(hdev->hw_info);
+ kfree_const(hdev->fw_info);
destroy_workqueue(hdev->workqueue);
destroy_workqueue(hdev->req_workqueue);
@@ -3268,6 +3268,28 @@ int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb)
}
EXPORT_SYMBOL(hci_recv_diag);
+void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...)
+{
+ va_list vargs;
+
+ va_start(vargs, fmt);
+ kfree_const(hdev->hw_info);
+ hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs);
+ va_end(vargs);
+}
+EXPORT_SYMBOL(hci_set_hw_info);
+
+void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...)
+{
+ va_list vargs;
+
+ va_start(vargs, fmt);
+ kfree_const(hdev->fw_info);
+ hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs);
+ va_end(vargs);
+}
+EXPORT_SYMBOL(hci_set_fw_info);
+
/* ---- Interface to upper protocols ---- */
int hci_register_cb(struct hci_cb *cb)
@@ -3415,7 +3437,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT;
switch (hdev->dev_type) {
- case HCI_BREDR:
+ case HCI_PRIMARY:
hci_add_acl_hdr(skb, conn->handle, flags);
break;
case HCI_AMP:
@@ -3826,7 +3848,7 @@ static void hci_sched_acl(struct hci_dev *hdev)
BT_DBG("%s", hdev->name);
/* No ACL link over BR/EDR controller */
- if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_BREDR)
+ if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_PRIMARY)
return;
/* No AMP link over AMP controller */
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 7db422094..63df63ebf 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -76,6 +76,30 @@ static const struct file_operations __name ## _fops = { \
.llseek = default_llseek, \
} \
+#define DEFINE_INFO_ATTRIBUTE(__name, __field) \
+static int __name ## _show(struct seq_file *f, void *ptr) \
+{ \
+ struct hci_dev *hdev = f->private; \
+ \
+ hci_dev_lock(hdev); \
+ seq_printf(f, "%s\n", hdev->__field ? : ""); \
+ hci_dev_unlock(hdev); \
+ \
+ return 0; \
+} \
+ \
+static int __name ## _open(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, __name ## _show, inode->i_private); \
+} \
+ \
+static const struct file_operations __name ## _fops = { \
+ .open = __name ## _open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+} \
+
static int features_show(struct seq_file *f, void *ptr)
{
struct hci_dev *hdev = f->private;
@@ -349,6 +373,9 @@ static const struct file_operations sc_only_mode_fops = {
.llseek = default_llseek,
};
+DEFINE_INFO_ATTRIBUTE(hardware_info, hw_info);
+DEFINE_INFO_ATTRIBUTE(firmware_info, fw_info);
+
void hci_debugfs_create_common(struct hci_dev *hdev)
{
debugfs_create_file("features", 0444, hdev->debugfs, hdev,
@@ -382,6 +409,14 @@ void hci_debugfs_create_common(struct hci_dev *hdev)
if (lmp_sc_capable(hdev) || lmp_le_capable(hdev))
debugfs_create_file("sc_only_mode", 0444, hdev->debugfs,
hdev, &sc_only_mode_fops);
+
+ if (hdev->hw_info)
+ debugfs_create_file("hardware_info", 0444, hdev->debugfs,
+ hdev, &hardware_info_fops);
+
+ if (hdev->fw_info)
+ debugfs_create_file("firmware_info", 0444, hdev->debugfs,
+ hdev, &firmware_info_fops);
}
static int inquiry_cache_show(struct seq_file *f, void *p)
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index d4b3dd541..e17aacbc5 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2332,7 +2332,7 @@ static u8 hci_to_mgmt_reason(u8 err)
static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_ev_disconn_complete *ev = (void *) skb->data;
- u8 reason = hci_to_mgmt_reason(ev->reason);
+ u8 reason;
struct hci_conn_params *params;
struct hci_conn *conn;
bool mgmt_connected;
@@ -2355,6 +2355,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn->state = BT_CLOSED;
mgmt_connected = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags);
+
+ if (test_bit(HCI_CONN_AUTH_FAILURE, &conn->flags))
+ reason = MGMT_DEV_DISCONN_AUTH_FAILURE;
+ else
+ reason = hci_to_mgmt_reason(ev->reason);
+
mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type,
reason, mgmt_connected);
@@ -2421,6 +2427,8 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
goto unlock;
if (!ev->status) {
+ clear_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+
if (!hci_conn_ssp_enabled(conn) &&
test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) {
BT_INFO("re-auth of legacy device is not possible.");
@@ -2429,6 +2437,9 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn->sec_level = conn->pending_sec_level;
}
} else {
+ if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING)
+ set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+
mgmt_auth_failed(conn, ev->status);
}
@@ -2613,6 +2624,9 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
if (ev->status && conn->state == BT_CONNECTED) {
+ if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING)
+ set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+
hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
hci_conn_drop(conn);
goto unlock;
@@ -3249,7 +3263,7 @@ static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev,
struct hci_chan *chan;
switch (hdev->dev_type) {
- case HCI_BREDR:
+ case HCI_PRIMARY:
return hci_conn_hash_lookup_handle(hdev, handle);
case HCI_AMP:
chan = hci_chan_lookup_handle(hdev, handle);
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index c045b3c54..b0e23dfc5 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -262,6 +262,8 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
break;
}
+ kfree_skb(hdev->req_skb);
+ hdev->req_skb = NULL;
hdev->req_status = hdev->req_result = 0;
BT_DBG("%s end: err %d", hdev->name, err);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1298d723c..96f04b7b9 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -676,7 +676,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
return -EOPNOTSUPP;
- if (hdev->dev_type != HCI_BREDR)
+ if (hdev->dev_type != HCI_PRIMARY)
return -EOPNOTSUPP;
switch (cmd) {
@@ -1048,6 +1048,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg,
struct sock *sk = sock->sk;
struct sk_buff *skb;
int copied, err;
+ unsigned int skblen;
BT_DBG("sock %p, sk %p", sock, sk);
@@ -1064,6 +1065,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg,
if (!skb)
return err;
+ skblen = skb->len;
copied = skb->len;
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
@@ -1089,6 +1091,9 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg,
skb_free_datagram(sk, skb);
+ if (flags & MSG_TRUNC)
+ copied = skblen;
+
return err ? : copied;
}
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 555982a78..ca7a35eba 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -7,50 +7,6 @@
static struct class *bt_class;
-static inline char *link_typetostr(int type)
-{
- switch (type) {
- case ACL_LINK:
- return "ACL";
- case SCO_LINK:
- return "SCO";
- case ESCO_LINK:
- return "eSCO";
- case LE_LINK:
- return "LE";
- default:
- return "UNKNOWN";
- }
-}
-
-static ssize_t show_link_type(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hci_conn *conn = to_hci_conn(dev);
- return sprintf(buf, "%s\n", link_typetostr(conn->type));
-}
-
-static ssize_t show_link_address(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hci_conn *conn = to_hci_conn(dev);
- return sprintf(buf, "%pMR\n", &conn->dst);
-}
-
-#define LINK_ATTR(_name, _mode, _show, _store) \
-struct device_attribute link_attr_##_name = __ATTR(_name, _mode, _show, _store)
-
-static LINK_ATTR(type, S_IRUGO, show_link_type, NULL);
-static LINK_ATTR(address, S_IRUGO, show_link_address, NULL);
-
-static struct attribute *bt_link_attrs[] = {
- &link_attr_type.attr,
- &link_attr_address.attr,
- NULL
-};
-
-ATTRIBUTE_GROUPS(bt_link);
-
static void bt_link_release(struct device *dev)
{
struct hci_conn *conn = to_hci_conn(dev);
@@ -59,7 +15,6 @@ static void bt_link_release(struct device *dev)
static struct device_type bt_link = {
.name = "link",
- .groups = bt_link_groups,
.release = bt_link_release,
};
@@ -124,59 +79,6 @@ void hci_conn_del_sysfs(struct hci_conn *conn)
hci_dev_put(hdev);
}
-static inline char *host_typetostr(int type)
-{
- switch (type) {
- case HCI_BREDR:
- return "BR/EDR";
- case HCI_AMP:
- return "AMP";
- default:
- return "UNKNOWN";
- }
-}
-
-static ssize_t show_type(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hci_dev *hdev = to_hci_dev(dev);
- return sprintf(buf, "%s\n", host_typetostr(hdev->dev_type));
-}
-
-static ssize_t show_name(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hci_dev *hdev = to_hci_dev(dev);
- char name[HCI_MAX_NAME_LENGTH + 1];
- int i;
-
- for (i = 0; i < HCI_MAX_NAME_LENGTH; i++)
- name[i] = hdev->dev_name[i];
-
- name[HCI_MAX_NAME_LENGTH] = '\0';
- return sprintf(buf, "%s\n", name);
-}
-
-static ssize_t show_address(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hci_dev *hdev = to_hci_dev(dev);
- return sprintf(buf, "%pMR\n", &hdev->bdaddr);
-}
-
-static DEVICE_ATTR(type, S_IRUGO, show_type, NULL);
-static DEVICE_ATTR(name, S_IRUGO, show_name, NULL);
-static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
-
-static struct attribute *bt_host_attrs[] = {
- &dev_attr_type.attr,
- &dev_attr_name.attr,
- &dev_attr_address.attr,
- NULL
-};
-
-ATTRIBUTE_GROUPS(bt_host);
-
static void bt_host_release(struct device *dev)
{
struct hci_dev *hdev = to_hci_dev(dev);
@@ -186,7 +88,6 @@ static void bt_host_release(struct device *dev)
static struct device_type bt_host = {
.name = "host",
- .groups = bt_host_groups,
.release = bt_host_release,
};
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index eb4f5f24c..d4cad29b0 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -32,6 +32,7 @@
#include <linux/debugfs.h>
#include <linux/crc16.h>
+#include <linux/filter.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -5835,6 +5836,9 @@ static int l2cap_reassemble_sdu(struct l2cap_chan *chan, struct sk_buff *skb,
if (chan->sdu)
break;
+ if (!pskb_may_pull(skb, L2CAP_SDULEN_SIZE))
+ break;
+
chan->sdu_len = get_unaligned_le16(skb->data);
skb_pull(skb, L2CAP_SDULEN_SIZE);
@@ -6610,6 +6614,10 @@ static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
goto drop;
}
+ if ((chan->mode == L2CAP_MODE_ERTM ||
+ chan->mode == L2CAP_MODE_STREAMING) && sk_filter(chan->data, skb))
+ goto drop;
+
if (!control->sframe) {
int err;
@@ -7468,7 +7476,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
int len;
/* For AMP controller do not create l2cap conn */
- if (!conn && hcon->hdev->dev_type != HCI_BREDR)
+ if (!conn && hcon->hdev->dev_type != HCI_PRIMARY)
goto drop;
if (!conn)
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 1842141ba..a8ba75273 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1019,7 +1019,7 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
goto done;
if (pi->rx_busy_skb) {
- if (!sock_queue_rcv_skb(sk, pi->rx_busy_skb))
+ if (!__sock_queue_rcv_skb(sk, pi->rx_busy_skb))
pi->rx_busy_skb = NULL;
else
goto done;
@@ -1270,7 +1270,17 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
goto done;
}
- err = sock_queue_rcv_skb(sk, skb);
+ if (chan->mode != L2CAP_MODE_ERTM &&
+ chan->mode != L2CAP_MODE_STREAMING) {
+ /* Even if no filter is attached, we could potentially
+ * get errors from security modules, etc.
+ */
+ err = sk_filter(sk, skb);
+ if (err)
+ goto done;
+ }
+
+ err = __sock_queue_rcv_skb(sk, skb);
/* For ERTM, handle one skb that doesn't fit into the recv
* buffer. This is important to do because the data frames
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 9e4b93158..7639290b6 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
#include "mgmt_util.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 12
+#define MGMT_REVISION 13
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -359,7 +359,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
count = 0;
list_for_each_entry(d, &hci_dev_list, list) {
- if (d->dev_type == HCI_BREDR &&
+ if (d->dev_type == HCI_PRIMARY &&
!hci_dev_test_flag(d, HCI_UNCONFIGURED))
count++;
}
@@ -384,7 +384,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
continue;
- if (d->dev_type == HCI_BREDR &&
+ if (d->dev_type == HCI_PRIMARY &&
!hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
rp->index[count++] = cpu_to_le16(d->id);
BT_DBG("Added hci%u", d->id);
@@ -419,7 +419,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
count = 0;
list_for_each_entry(d, &hci_dev_list, list) {
- if (d->dev_type == HCI_BREDR &&
+ if (d->dev_type == HCI_PRIMARY &&
hci_dev_test_flag(d, HCI_UNCONFIGURED))
count++;
}
@@ -444,7 +444,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
continue;
- if (d->dev_type == HCI_BREDR &&
+ if (d->dev_type == HCI_PRIMARY &&
hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
rp->index[count++] = cpu_to_le16(d->id);
BT_DBG("Added hci%u", d->id);
@@ -479,7 +479,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
count = 0;
list_for_each_entry(d, &hci_dev_list, list) {
- if (d->dev_type == HCI_BREDR || d->dev_type == HCI_AMP)
+ if (d->dev_type == HCI_PRIMARY || d->dev_type == HCI_AMP)
count++;
}
@@ -503,7 +503,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
continue;
- if (d->dev_type == HCI_BREDR) {
+ if (d->dev_type == HCI_PRIMARY) {
if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
rp->entry[count].type = 0x01;
else
@@ -6366,7 +6366,7 @@ void mgmt_index_added(struct hci_dev *hdev)
return;
switch (hdev->dev_type) {
- case HCI_BREDR:
+ case HCI_PRIMARY:
if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev,
NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS);
@@ -6399,7 +6399,7 @@ void mgmt_index_removed(struct hci_dev *hdev)
return;
switch (hdev->dev_type) {
- case HCI_BREDR:
+ case HCI_PRIMARY:
mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 50976a648..4c1a16a96 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -22,9 +22,9 @@
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
+#include <linux/crypto.h>
#include <crypto/b128ops.h>
#include <crypto/hash.h>
-#include <crypto/skcipher.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -88,7 +88,7 @@ struct smp_dev {
u8 min_key_size;
u8 max_key_size;
- struct crypto_skcipher *tfm_aes;
+ struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
};
@@ -127,7 +127,7 @@ struct smp_chan {
u8 dhkey[32];
u8 mackey[16];
- struct crypto_skcipher *tfm_aes;
+ struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
};
@@ -361,10 +361,8 @@ static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16],
* s1 and ah.
*/
-static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r)
+static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- struct scatterlist sg;
uint8_t tmp[16], data[16];
int err;
@@ -378,7 +376,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r)
/* The most significant octet of key corresponds to k[0] */
swap_buf(k, tmp, 16);
- err = crypto_skcipher_setkey(tfm, tmp, 16);
+ err = crypto_cipher_setkey(tfm, tmp, 16);
if (err) {
BT_ERR("cipher setkey failed: %d", err);
return err;
@@ -387,16 +385,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r)
/* Most significant octet of plaintextData corresponds to data[0] */
swap_buf(r, data, 16);
- sg_init_one(&sg, data, 16);
-
- skcipher_request_set_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg, &sg, 16, NULL);
-
- err = crypto_skcipher_encrypt(req);
- skcipher_request_zero(req);
- if (err)
- BT_ERR("Encrypt data error %d", err);
+ crypto_cipher_encrypt_one(tfm, data, data);
/* Most significant octet of encryptedData corresponds to data[0] */
swap_buf(data, r, 16);
@@ -406,7 +395,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r)
return err;
}
-static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16],
+static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16],
const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat,
const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16])
{
@@ -455,7 +444,7 @@ static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16],
return err;
}
-static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16],
+static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16],
const u8 r1[16], const u8 r2[16], u8 _r[16])
{
int err;
@@ -471,7 +460,7 @@ static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16],
return err;
}
-static int smp_ah(struct crypto_skcipher *tfm, const u8 irk[16],
+static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16],
const u8 r[3], u8 res[3])
{
u8 _res[16];
@@ -759,7 +748,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
kzfree(smp->slave_csrk);
kzfree(smp->link_key);
- crypto_free_skcipher(smp->tfm_aes);
+ crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
/* Ensure that we don't leave any debug key around if debug key
@@ -1359,9 +1348,9 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
if (!smp)
return NULL;
- smp->tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+ smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(smp->tfm_aes)) {
- BT_ERR("Unable to create ECB crypto context");
+ BT_ERR("Unable to create AES crypto context");
kzfree(smp);
return NULL;
}
@@ -1369,7 +1358,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(smp->tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- crypto_free_skcipher(smp->tfm_aes);
+ crypto_free_cipher(smp->tfm_aes);
kzfree(smp);
return NULL;
}
@@ -3120,7 +3109,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
{
struct l2cap_chan *chan;
struct smp_dev *smp;
- struct crypto_skcipher *tfm_aes;
+ struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
if (cid == L2CAP_CID_SMP_BREDR) {
@@ -3132,9 +3121,9 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
if (!smp)
return ERR_PTR(-ENOMEM);
- tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+ tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_aes)) {
- BT_ERR("Unable to create ECB crypto context");
+ BT_ERR("Unable to create AES crypto context");
kzfree(smp);
return ERR_CAST(tfm_aes);
}
@@ -3142,7 +3131,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- crypto_free_skcipher(tfm_aes);
+ crypto_free_cipher(tfm_aes);
kzfree(smp);
return ERR_CAST(tfm_cmac);
}
@@ -3156,7 +3145,7 @@ create_chan:
chan = l2cap_chan_create();
if (!chan) {
if (smp) {
- crypto_free_skcipher(smp->tfm_aes);
+ crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
kzfree(smp);
}
@@ -3203,7 +3192,7 @@ static void smp_del_chan(struct l2cap_chan *chan)
smp = chan->data;
if (smp) {
chan->data = NULL;
- crypto_free_skcipher(smp->tfm_aes);
+ crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
kzfree(smp);
}
@@ -3440,7 +3429,7 @@ void smp_unregister(struct hci_dev *hdev)
#if IS_ENABLED(CONFIG_BT_SELFTEST_SMP)
-static int __init test_ah(struct crypto_skcipher *tfm_aes)
+static int __init test_ah(struct crypto_cipher *tfm_aes)
{
const u8 irk[16] = {
0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
@@ -3460,7 +3449,7 @@ static int __init test_ah(struct crypto_skcipher *tfm_aes)
return 0;
}
-static int __init test_c1(struct crypto_skcipher *tfm_aes)
+static int __init test_c1(struct crypto_cipher *tfm_aes)
{
const u8 k[16] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3490,7 +3479,7 @@ static int __init test_c1(struct crypto_skcipher *tfm_aes)
return 0;
}
-static int __init test_s1(struct crypto_skcipher *tfm_aes)
+static int __init test_s1(struct crypto_cipher *tfm_aes)
{
const u8 k[16] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3686,7 +3675,7 @@ static const struct file_operations test_smp_fops = {
.llseek = default_llseek,
};
-static int __init run_selftests(struct crypto_skcipher *tfm_aes,
+static int __init run_selftests(struct crypto_cipher *tfm_aes,
struct crypto_shash *tfm_cmac)
{
ktime_t calltime, delta, rettime;
@@ -3764,27 +3753,27 @@ done:
int __init bt_selftest_smp(void)
{
- struct crypto_skcipher *tfm_aes;
+ struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
int err;
- tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+ tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_aes)) {
- BT_ERR("Unable to create ECB crypto context");
+ BT_ERR("Unable to create AES crypto context");
return PTR_ERR(tfm_aes);
}
tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- crypto_free_skcipher(tfm_aes);
+ crypto_free_cipher(tfm_aes);
return PTR_ERR(tfm_cmac);
}
err = run_selftests(tfm_aes, tfm_cmac);
crypto_free_shash(tfm_cmac);
- crypto_free_skcipher(tfm_aes);
+ crypto_free_cipher(tfm_aes);
return err;
}
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 2c8095a5d..09f26940a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -61,11 +61,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
goto out;
- if (is_broadcast_ether_addr(dest))
- br_flood_deliver(br, skb, false);
- else if (is_multicast_ether_addr(dest)) {
+ if (is_broadcast_ether_addr(dest)) {
+ br_flood(br, skb, false, false, true);
+ } else if (is_multicast_ether_addr(dest)) {
if (unlikely(netpoll_tx_running(dev))) {
- br_flood_deliver(br, skb, false);
+ br_flood(br, skb, false, false, true);
goto out;
}
if (br_multicast_rcv(br, NULL, skb, vid)) {
@@ -76,14 +76,14 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb)))
- br_multicast_deliver(mdst, skb);
+ br_multicast_flood(mdst, skb, false, true);
else
- br_flood_deliver(br, skb, false);
- } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL)
- br_deliver(dst->dst, skb);
- else
- br_flood_deliver(br, skb, true);
-
+ br_flood(br, skb, false, false, true);
+ } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) {
+ br_forward(dst->dst, skb, false, true);
+ } else {
+ br_flood(br, skb, true, false, true);
+ }
out:
rcu_read_unlock();
return NETDEV_TX_OK;
@@ -104,8 +104,16 @@ static int br_dev_init(struct net_device *dev)
return -ENOMEM;
err = br_vlan_init(br);
- if (err)
+ if (err) {
free_percpu(br->stats);
+ return err;
+ }
+
+ err = br_multicast_init_stats(br);
+ if (err) {
+ free_percpu(br->stats);
+ br_vlan_flush(br);
+ }
br_set_lockdep_class(dev);
return err;
@@ -341,6 +349,8 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_add_slave = br_add_slave,
.ndo_del_slave = br_del_slave,
.ndo_fix_features = br_fix_features,
+ .ndo_neigh_construct = netdev_default_l2upper_neigh_construct,
+ .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy,
.ndo_fdb_add = br_fdb_add,
.ndo_fdb_del = br_fdb_delete,
.ndo_fdb_dump = br_fdb_dump,
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index c18080ad4..cd620fab4 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -267,7 +267,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
/* If old entry was unassociated with any port, then delete it. */
f = __br_fdb_get(br, br->dev->dev_addr, 0);
- if (f && f->is_local && !f->dst)
+ if (f && f->is_local && !f->dst && !f->added_by_user)
fdb_delete_local(br, NULL, f);
fdb_insert(br, NULL, newaddr, 0);
@@ -282,7 +282,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
if (!br_vlan_should_use(v))
continue;
f = __br_fdb_get(br, br->dev->dev_addr, v->vid);
- if (f && f->is_local && !f->dst)
+ if (f && f->is_local && !f->dst && !f->added_by_user)
fdb_delete_local(br, NULL, f);
fdb_insert(br, NULL, newaddr, v->vid);
}
@@ -764,20 +764,25 @@ out:
}
/* Update (create or replace) forwarding database entry */
-static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
- __u16 state, __u16 flags, __u16 vid)
+static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
+ const __u8 *addr, __u16 state, __u16 flags, __u16 vid)
{
- struct net_bridge *br = source->br;
struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
struct net_bridge_fdb_entry *fdb;
bool modified = false;
/* If the port cannot learn allow only local and static entries */
- if (!(state & NUD_PERMANENT) && !(state & NUD_NOARP) &&
+ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) &&
!(source->state == BR_STATE_LEARNING ||
source->state == BR_STATE_FORWARDING))
return -EPERM;
+ if (!source && !(state & NUD_PERMANENT)) {
+ pr_info("bridge: RTM_NEWNEIGH %s without NUD_PERMANENT\n",
+ br->dev->name);
+ return -EINVAL;
+ }
+
fdb = fdb_find(head, addr, vid);
if (fdb == NULL) {
if (!(flags & NLM_F_CREATE))
@@ -832,22 +837,28 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
return 0;
}
-static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p,
- const unsigned char *addr, u16 nlh_flags, u16 vid)
+static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
+ struct net_bridge_port *p, const unsigned char *addr,
+ u16 nlh_flags, u16 vid)
{
int err = 0;
if (ndm->ndm_flags & NTF_USE) {
+ if (!p) {
+ pr_info("bridge: RTM_NEWNEIGH %s with NTF_USE is not supported\n",
+ br->dev->name);
+ return -EINVAL;
+ }
local_bh_disable();
rcu_read_lock();
- br_fdb_update(p->br, p, addr, vid, true);
+ br_fdb_update(br, p, addr, vid, true);
rcu_read_unlock();
local_bh_enable();
} else {
- spin_lock_bh(&p->br->hash_lock);
- err = fdb_add_entry(p, addr, ndm->ndm_state,
+ spin_lock_bh(&br->hash_lock);
+ err = fdb_add_entry(br, p, addr, ndm->ndm_state,
nlh_flags, vid);
- spin_unlock_bh(&p->br->hash_lock);
+ spin_unlock_bh(&br->hash_lock);
}
return err;
@@ -884,6 +895,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
dev->name);
return -EINVAL;
}
+ br = p->br;
vg = nbp_vlan_group(p);
}
@@ -895,15 +907,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
}
/* VID was specified, so use it. */
- if (dev->priv_flags & IFF_EBRIDGE)
- err = br_fdb_insert(br, NULL, addr, vid);
- else
- err = __br_fdb_add(ndm, p, addr, nlh_flags, vid);
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid);
} else {
- if (dev->priv_flags & IFF_EBRIDGE)
- err = br_fdb_insert(br, NULL, addr, 0);
- else
- err = __br_fdb_add(ndm, p, addr, nlh_flags, 0);
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0);
if (err || !vg || !vg->num_vlans)
goto out;
@@ -914,11 +920,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
list_for_each_entry(v, &vg->vlan_list, vlist) {
if (!br_vlan_should_use(v))
continue;
- if (dev->priv_flags & IFF_EBRIDGE)
- err = br_fdb_insert(br, NULL, addr, v->vid);
- else
- err = __br_fdb_add(ndm, p, addr, nlh_flags,
- v->vid);
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid);
if (err)
goto out;
}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index f47759f05..63a83d8d7 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -21,11 +21,6 @@
#include <linux/netfilter_bridge.h>
#include "br_private.h"
-static int deliver_clone(const struct net_bridge_port *prev,
- struct sk_buff *skb,
- void (*__packet_hook)(const struct net_bridge_port *p,
- struct sk_buff *skb));
-
/* Don't forward packets to originating port or forwarding disabled */
static inline int should_deliver(const struct net_bridge_port *p,
const struct sk_buff *skb)
@@ -75,105 +70,92 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(br_forward_finish);
-static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+static void __br_forward(const struct net_bridge_port *to,
+ struct sk_buff *skb, bool local_orig)
{
struct net_bridge_vlan_group *vg;
+ struct net_device *indev;
+ struct net *net;
+ int br_hook;
vg = nbp_vlan_group_rcu(to);
skb = br_handle_vlan(to->br, vg, skb);
if (!skb)
return;
+ indev = skb->dev;
skb->dev = to->dev;
-
- if (unlikely(netpoll_tx_running(to->br->dev))) {
- if (!is_skb_forwardable(skb->dev, skb))
+ if (!local_orig) {
+ if (skb_warn_if_lro(skb)) {
kfree_skb(skb);
- else {
- skb_push(skb, ETH_HLEN);
- br_netpoll_send_skb(to, skb);
+ return;
}
- return;
+ br_hook = NF_BR_FORWARD;
+ skb_forward_csum(skb);
+ net = dev_net(indev);
+ } else {
+ if (unlikely(netpoll_tx_running(to->br->dev))) {
+ if (!is_skb_forwardable(skb->dev, skb)) {
+ kfree_skb(skb);
+ } else {
+ skb_push(skb, ETH_HLEN);
+ br_netpoll_send_skb(to, skb);
+ }
+ return;
+ }
+ br_hook = NF_BR_LOCAL_OUT;
+ net = dev_net(skb->dev);
+ indev = NULL;
}
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
- dev_net(skb->dev), NULL, skb,NULL, skb->dev,
+ NF_HOOK(NFPROTO_BRIDGE, br_hook,
+ net, NULL, skb, indev, skb->dev,
br_forward_finish);
}
-static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
+static int deliver_clone(const struct net_bridge_port *prev,
+ struct sk_buff *skb, bool local_orig)
{
- struct net_bridge_vlan_group *vg;
- struct net_device *indev;
-
- if (skb_warn_if_lro(skb)) {
- kfree_skb(skb);
- return;
- }
-
- vg = nbp_vlan_group_rcu(to);
- skb = br_handle_vlan(to->br, vg, skb);
- if (!skb)
- return;
-
- indev = skb->dev;
- skb->dev = to->dev;
- skb_forward_csum(skb);
-
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD,
- dev_net(indev), NULL, skb, indev, skb->dev,
- br_forward_finish);
-}
+ struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
-/* called with rcu_read_lock */
-void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
-{
- if (to && should_deliver(to, skb)) {
- __br_deliver(to, skb);
- return;
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb) {
+ dev->stats.tx_dropped++;
+ return -ENOMEM;
}
- kfree_skb(skb);
+ __br_forward(prev, skb, local_orig);
+ return 0;
}
-EXPORT_SYMBOL_GPL(br_deliver);
-/* called with rcu_read_lock */
-void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0)
+/**
+ * br_forward - forward a packet to a specific port
+ * @to: destination port
+ * @skb: packet being forwarded
+ * @local_rcv: packet will be received locally after forwarding
+ * @local_orig: packet is locally originated
+ *
+ * Should be called with rcu_read_lock.
+ */
+void br_forward(const struct net_bridge_port *to,
+ struct sk_buff *skb, bool local_rcv, bool local_orig)
{
if (to && should_deliver(to, skb)) {
- if (skb0)
- deliver_clone(to, skb, __br_forward);
+ if (local_rcv)
+ deliver_clone(to, skb, local_orig);
else
- __br_forward(to, skb);
+ __br_forward(to, skb, local_orig);
return;
}
- if (!skb0)
+ if (!local_rcv)
kfree_skb(skb);
}
-
-static int deliver_clone(const struct net_bridge_port *prev,
- struct sk_buff *skb,
- void (*__packet_hook)(const struct net_bridge_port *p,
- struct sk_buff *skb))
-{
- struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
-
- skb = skb_clone(skb, GFP_ATOMIC);
- if (!skb) {
- dev->stats.tx_dropped++;
- return -ENOMEM;
- }
-
- __packet_hook(prev, skb);
- return 0;
-}
+EXPORT_SYMBOL_GPL(br_forward);
static struct net_bridge_port *maybe_deliver(
struct net_bridge_port *prev, struct net_bridge_port *p,
- struct sk_buff *skb,
- void (*__packet_hook)(const struct net_bridge_port *p,
- struct sk_buff *skb))
+ struct sk_buff *skb, bool local_orig)
{
int err;
@@ -183,7 +165,7 @@ static struct net_bridge_port *maybe_deliver(
if (!prev)
goto out;
- err = deliver_clone(prev, skb, __packet_hook);
+ err = deliver_clone(prev, skb, local_orig);
if (err)
return ERR_PTR(err);
@@ -191,17 +173,13 @@ out:
return p;
}
-/* called under bridge lock */
-static void br_flood(struct net_bridge *br, struct sk_buff *skb,
- struct sk_buff *skb0,
- void (*__packet_hook)(const struct net_bridge_port *p,
- struct sk_buff *skb),
- bool unicast)
+/* called under rcu_read_lock */
+void br_flood(struct net_bridge *br, struct sk_buff *skb,
+ bool unicast, bool local_rcv, bool local_orig)
{
+ u8 igmp_type = br_multicast_igmp_type(skb);
+ struct net_bridge_port *prev = NULL;
struct net_bridge_port *p;
- struct net_bridge_port *prev;
-
- prev = NULL;
list_for_each_entry_rcu(p, &br->port_list, list) {
/* Do not flood unicast traffic to ports that turn it off */
@@ -215,48 +193,36 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
BR_INPUT_SKB_CB(skb)->proxyarp_replied)
continue;
- prev = maybe_deliver(prev, p, skb, __packet_hook);
+ prev = maybe_deliver(prev, p, skb, local_orig);
if (IS_ERR(prev))
goto out;
+ if (prev == p)
+ br_multicast_count(p->br, p, skb, igmp_type,
+ BR_MCAST_DIR_TX);
}
if (!prev)
goto out;
- if (skb0)
- deliver_clone(prev, skb, __packet_hook);
+ if (local_rcv)
+ deliver_clone(prev, skb, local_orig);
else
- __packet_hook(prev, skb);
+ __br_forward(prev, skb, local_orig);
return;
out:
- if (!skb0)
+ if (!local_rcv)
kfree_skb(skb);
}
-
-/* called with rcu_read_lock */
-void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast)
-{
- br_flood(br, skb, NULL, __br_deliver, unicast);
-}
-
-/* called under bridge lock */
-void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
- struct sk_buff *skb2, bool unicast)
-{
- br_flood(br, skb, skb2, __br_forward, unicast);
-}
-
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
/* called with rcu_read_lock */
-static void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
- struct sk_buff *skb, struct sk_buff *skb0,
- void (*__packet_hook)(
- const struct net_bridge_port *p,
- struct sk_buff *skb))
+void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb,
+ bool local_rcv, bool local_orig)
{
struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+ u8 igmp_type = br_multicast_igmp_type(skb);
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *prev = NULL;
struct net_bridge_port_group *p;
@@ -274,9 +240,12 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
port = (unsigned long)lport > (unsigned long)rport ?
lport : rport;
- prev = maybe_deliver(prev, port, skb, __packet_hook);
+ prev = maybe_deliver(prev, port, skb, local_orig);
if (IS_ERR(prev))
goto out;
+ if (prev == port)
+ br_multicast_count(port->br, port, skb, igmp_type,
+ BR_MCAST_DIR_TX);
if ((unsigned long)lport >= (unsigned long)port)
p = rcu_dereference(p->next);
@@ -287,28 +256,14 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
if (!prev)
goto out;
- if (skb0)
- deliver_clone(prev, skb, __packet_hook);
+ if (local_rcv)
+ deliver_clone(prev, skb, local_orig);
else
- __packet_hook(prev, skb);
+ __br_forward(prev, skb, local_orig);
return;
out:
- if (!skb0)
+ if (!local_rcv)
kfree_skb(skb);
}
-
-/* called with rcu_read_lock */
-void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
- struct sk_buff *skb)
-{
- br_multicast_flood(mdst, skb, NULL, __br_deliver);
-}
-
-/* called with rcu_read_lock */
-void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
- struct sk_buff *skb, struct sk_buff *skb2)
-{
- br_multicast_flood(mdst, skb, skb2, __br_forward);
-}
#endif
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8217aecf0..f2fede05d 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -345,8 +345,8 @@ static int find_portno(struct net_bridge *br)
static struct net_bridge_port *new_nbp(struct net_bridge *br,
struct net_device *dev)
{
- int index;
struct net_bridge_port *p;
+ int index, err;
index = find_portno(br);
if (index < 0)
@@ -366,7 +366,12 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
br_init_port(p);
br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
- br_multicast_add_port(p);
+ err = br_multicast_add_port(p);
+ if (err) {
+ dev_put(dev);
+ kfree(p);
+ p = ERR_PTR(err);
+ }
return p;
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 28d5ec269..abe11f085 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -60,6 +60,9 @@ static int br_pass_frame_up(struct sk_buff *skb)
skb = br_handle_vlan(br, vg, skb);
if (!skb)
return NET_RX_DROP;
+ /* update the multicast stats if the packet is IGMP/MLD */
+ br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb),
+ BR_MCAST_DIR_TX);
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
dev_net(indev), NULL, skb, indev, NULL,
@@ -77,13 +80,10 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
- if (dev->flags & IFF_NOARP)
+ if ((dev->flags & IFF_NOARP) ||
+ !pskb_may_pull(skb, arp_hdr_len(dev)))
return;
- if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
- dev->stats.tx_dropped++;
- return;
- }
parp = arp_hdr(skb);
if (parp->ar_pro != htons(ETH_P_IP) ||
@@ -128,13 +128,12 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
/* note: already called with rcu_read_lock */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- const unsigned char *dest = eth_hdr(skb)->h_dest;
+ bool local_rcv = false, mcast_hit = false, unicast = true;
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
- struct net_bridge *br;
- struct net_bridge_fdb_entry *dst;
+ const unsigned char *dest = eth_hdr(skb)->h_dest;
+ struct net_bridge_fdb_entry *dst = NULL;
struct net_bridge_mdb_entry *mdst;
- struct sk_buff *skb2;
- bool unicast = true;
+ struct net_bridge *br;
u16 vid = 0;
if (!p || p->state == BR_STATE_DISABLED)
@@ -157,53 +156,46 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
- /* The packet skb2 goes to the local host (NULL to skip). */
- skb2 = NULL;
-
- if (br->dev->flags & IFF_PROMISC)
- skb2 = skb;
-
- dst = NULL;
+ local_rcv = !!(br->dev->flags & IFF_PROMISC);
if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
br_do_proxy_arp(skb, br, vid, p);
if (is_broadcast_ether_addr(dest)) {
- skb2 = skb;
+ local_rcv = true;
unicast = false;
} else if (is_multicast_ether_addr(dest)) {
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb))) {
if ((mdst && mdst->mglist) ||
- br_multicast_is_router(br))
- skb2 = skb;
- br_multicast_forward(mdst, skb, skb2);
- skb = NULL;
- if (!skb2)
- goto out;
- } else
- skb2 = skb;
-
+ br_multicast_is_router(br)) {
+ local_rcv = true;
+ br->dev->stats.multicast++;
+ }
+ mcast_hit = true;
+ } else {
+ local_rcv = true;
+ br->dev->stats.multicast++;
+ }
unicast = false;
- br->dev->stats.multicast++;
- } else if ((dst = __br_fdb_get(br, dest, vid)) &&
- dst->is_local) {
- skb2 = skb;
+ } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) {
/* Do not forward the packet since it's local. */
- skb = NULL;
+ return br_pass_frame_up(skb);
}
- if (skb) {
- if (dst) {
- dst->used = jiffies;
- br_forward(dst->dst, skb, skb2);
- } else
- br_flood_forward(br, skb, skb2, unicast);
+ if (dst) {
+ dst->used = jiffies;
+ br_forward(dst->dst, skb, local_rcv, false);
+ } else {
+ if (!mcast_hit)
+ br_flood(br, skb, unicast, local_rcv, false);
+ else
+ br_multicast_flood(mdst, skb, local_rcv, false);
}
- if (skb2)
- return br_pass_frame_up(skb2);
+ if (local_rcv)
+ return br_pass_frame_up(skb);
out:
return 0;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index d3abdaefe..c5fea9393 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -361,7 +361,8 @@ out:
}
static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
- __be32 group)
+ __be32 group,
+ u8 *igmp_type)
{
struct sk_buff *skb;
struct igmphdr *ih;
@@ -411,6 +412,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
skb_set_transport_header(skb, skb->len);
ih = igmp_hdr(skb);
+ *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY;
ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
ih->code = (group ? br->multicast_last_member_interval :
br->multicast_query_response_interval) /
@@ -428,7 +430,8 @@ out:
#if IS_ENABLED(CONFIG_IPV6)
static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
- const struct in6_addr *group)
+ const struct in6_addr *grp,
+ u8 *igmp_type)
{
struct sk_buff *skb;
struct ipv6hdr *ip6h;
@@ -487,16 +490,17 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
skb_set_transport_header(skb, skb->len);
mldq = (struct mld_msg *) icmp6_hdr(skb);
- interval = ipv6_addr_any(group) ?
+ interval = ipv6_addr_any(grp) ?
br->multicast_query_response_interval :
br->multicast_last_member_interval;
+ *igmp_type = ICMPV6_MGM_QUERY;
mldq->mld_type = ICMPV6_MGM_QUERY;
mldq->mld_code = 0;
mldq->mld_cksum = 0;
mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
mldq->mld_reserved = 0;
- mldq->mld_mca = *group;
+ mldq->mld_mca = *grp;
/* checksum */
mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
@@ -513,14 +517,16 @@ out:
#endif
static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br,
- struct br_ip *addr)
+ struct br_ip *addr,
+ u8 *igmp_type)
{
switch (addr->proto) {
case htons(ETH_P_IP):
- return br_ip4_multicast_alloc_query(br, addr->u.ip4);
+ return br_ip4_multicast_alloc_query(br, addr->u.ip4, igmp_type);
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- return br_ip6_multicast_alloc_query(br, &addr->u.ip6);
+ return br_ip6_multicast_alloc_query(br, &addr->u.ip6,
+ igmp_type);
#endif
}
return NULL;
@@ -829,18 +835,23 @@ static void __br_multicast_send_query(struct net_bridge *br,
struct br_ip *ip)
{
struct sk_buff *skb;
+ u8 igmp_type;
- skb = br_multicast_alloc_query(br, ip);
+ skb = br_multicast_alloc_query(br, ip, &igmp_type);
if (!skb)
return;
if (port) {
skb->dev = port->dev;
+ br_multicast_count(br, port, skb, igmp_type,
+ BR_MCAST_DIR_TX);
NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
dev_net(port->dev), NULL, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
} else {
br_multicast_select_own_querier(br, ip, skb);
+ br_multicast_count(br, port, skb, igmp_type,
+ BR_MCAST_DIR_RX);
netif_rx(skb);
}
}
@@ -918,7 +929,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
}
#endif
-void br_multicast_add_port(struct net_bridge_port *port)
+int br_multicast_add_port(struct net_bridge_port *port)
{
port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
@@ -930,6 +941,11 @@ void br_multicast_add_port(struct net_bridge_port *port)
setup_timer(&port->ip6_own_query.timer,
br_ip6_multicast_port_query_expired, (unsigned long)port);
#endif
+ port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
+ if (!port->mcast_stats)
+ return -ENOMEM;
+
+ return 0;
}
void br_multicast_del_port(struct net_bridge_port *port)
@@ -944,6 +960,7 @@ void br_multicast_del_port(struct net_bridge_port *port)
br_multicast_del_pg(br, pg);
spin_unlock_bh(&br->multicast_lock);
del_timer_sync(&port->multicast_router_timer);
+ free_percpu(port->mcast_stats);
}
static void br_multicast_enable(struct bridge_mcast_own_query *query)
@@ -1583,6 +1600,39 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
}
#endif
+static void br_multicast_err_count(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ __be16 proto)
+{
+ struct bridge_mcast_stats __percpu *stats;
+ struct bridge_mcast_stats *pstats;
+
+ if (!br->multicast_stats_enabled)
+ return;
+
+ if (p)
+ stats = p->mcast_stats;
+ else
+ stats = br->mcast_stats;
+ if (WARN_ON(!stats))
+ return;
+
+ pstats = this_cpu_ptr(stats);
+
+ u64_stats_update_begin(&pstats->syncp);
+ switch (proto) {
+ case htons(ETH_P_IP):
+ pstats->mstats.igmp_parse_errors++;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ pstats->mstats.mld_parse_errors++;
+ break;
+#endif
+ }
+ u64_stats_update_end(&pstats->syncp);
+}
+
static int br_multicast_ipv4_rcv(struct net_bridge *br,
struct net_bridge_port *port,
struct sk_buff *skb,
@@ -1599,11 +1649,12 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
return 0;
} else if (err < 0) {
+ br_multicast_err_count(br, port, skb->protocol);
return err;
}
- BR_INPUT_SKB_CB(skb)->igmp = 1;
ih = igmp_hdr(skb);
+ BR_INPUT_SKB_CB(skb)->igmp = ih->type;
switch (ih->type) {
case IGMP_HOST_MEMBERSHIP_REPORT:
@@ -1625,6 +1676,9 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
if (skb_trimmed && skb_trimmed != skb)
kfree_skb(skb_trimmed);
+ br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp,
+ BR_MCAST_DIR_RX);
+
return err;
}
@@ -1645,11 +1699,12 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
return 0;
} else if (err < 0) {
+ br_multicast_err_count(br, port, skb->protocol);
return err;
}
- BR_INPUT_SKB_CB(skb)->igmp = 1;
mld = (struct mld_msg *)skb_transport_header(skb);
+ BR_INPUT_SKB_CB(skb)->igmp = mld->mld_type;
switch (mld->mld_type) {
case ICMPV6_MGM_REPORT:
@@ -1670,6 +1725,9 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
if (skb_trimmed && skb_trimmed != skb)
kfree_skb(skb_trimmed);
+ br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp,
+ BR_MCAST_DIR_RX);
+
return err;
}
#endif
@@ -1677,6 +1735,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
struct sk_buff *skb, u16 vid)
{
+ int ret = 0;
+
BR_INPUT_SKB_CB(skb)->igmp = 0;
BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
@@ -1685,14 +1745,16 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
switch (skb->protocol) {
case htons(ETH_P_IP):
- return br_multicast_ipv4_rcv(br, port, skb, vid);
+ ret = br_multicast_ipv4_rcv(br, port, skb, vid);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- return br_multicast_ipv6_rcv(br, port, skb, vid);
+ ret = br_multicast_ipv6_rcv(br, port, skb, vid);
+ break;
#endif
}
- return 0;
+ return ret;
}
static void br_multicast_query_expired(struct net_bridge *br,
@@ -1831,6 +1893,8 @@ void br_multicast_dev_del(struct net_bridge *br)
out:
spin_unlock_bh(&br->multicast_lock);
+
+ free_percpu(br->mcast_stats);
}
int br_multicast_set_router(struct net_bridge *br, unsigned long val)
@@ -2185,3 +2249,154 @@ unlock:
return ret;
}
EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent);
+
+static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats,
+ const struct sk_buff *skb, u8 type, u8 dir)
+{
+ struct bridge_mcast_stats *pstats = this_cpu_ptr(stats);
+ __be16 proto = skb->protocol;
+ unsigned int t_len;
+
+ u64_stats_update_begin(&pstats->syncp);
+ switch (proto) {
+ case htons(ETH_P_IP):
+ t_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);
+ switch (type) {
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ pstats->mstats.igmp_v1reports[dir]++;
+ break;
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ pstats->mstats.igmp_v2reports[dir]++;
+ break;
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ pstats->mstats.igmp_v3reports[dir]++;
+ break;
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ if (t_len != sizeof(struct igmphdr)) {
+ pstats->mstats.igmp_v3queries[dir]++;
+ } else {
+ unsigned int offset = skb_transport_offset(skb);
+ struct igmphdr *ih, _ihdr;
+
+ ih = skb_header_pointer(skb, offset,
+ sizeof(_ihdr), &_ihdr);
+ if (!ih)
+ break;
+ if (!ih->code)
+ pstats->mstats.igmp_v1queries[dir]++;
+ else
+ pstats->mstats.igmp_v2queries[dir]++;
+ }
+ break;
+ case IGMP_HOST_LEAVE_MESSAGE:
+ pstats->mstats.igmp_leaves[dir]++;
+ break;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ t_len = ntohs(ipv6_hdr(skb)->payload_len) +
+ sizeof(struct ipv6hdr);
+ t_len -= skb_network_header_len(skb);
+ switch (type) {
+ case ICMPV6_MGM_REPORT:
+ pstats->mstats.mld_v1reports[dir]++;
+ break;
+ case ICMPV6_MLD2_REPORT:
+ pstats->mstats.mld_v2reports[dir]++;
+ break;
+ case ICMPV6_MGM_QUERY:
+ if (t_len != sizeof(struct mld_msg))
+ pstats->mstats.mld_v2queries[dir]++;
+ else
+ pstats->mstats.mld_v1queries[dir]++;
+ break;
+ case ICMPV6_MGM_REDUCTION:
+ pstats->mstats.mld_leaves[dir]++;
+ break;
+ }
+ break;
+#endif /* CONFIG_IPV6 */
+ }
+ u64_stats_update_end(&pstats->syncp);
+}
+
+void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
+ const struct sk_buff *skb, u8 type, u8 dir)
+{
+ struct bridge_mcast_stats __percpu *stats;
+
+ /* if multicast_disabled is true then igmp type can't be set */
+ if (!type || !br->multicast_stats_enabled)
+ return;
+
+ if (p)
+ stats = p->mcast_stats;
+ else
+ stats = br->mcast_stats;
+ if (WARN_ON(!stats))
+ return;
+
+ br_mcast_stats_add(stats, skb, type, dir);
+}
+
+int br_multicast_init_stats(struct net_bridge *br)
+{
+ br->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
+ if (!br->mcast_stats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void mcast_stats_add_dir(u64 *dst, u64 *src)
+{
+ dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
+ dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX];
+}
+
+void br_multicast_get_stats(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct br_mcast_stats *dest)
+{
+ struct bridge_mcast_stats __percpu *stats;
+ struct br_mcast_stats tdst;
+ int i;
+
+ memset(dest, 0, sizeof(*dest));
+ if (p)
+ stats = p->mcast_stats;
+ else
+ stats = br->mcast_stats;
+ if (WARN_ON(!stats))
+ return;
+
+ memset(&tdst, 0, sizeof(tdst));
+ for_each_possible_cpu(i) {
+ struct bridge_mcast_stats *cpu_stats = per_cpu_ptr(stats, i);
+ struct br_mcast_stats temp;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ memcpy(&temp, &cpu_stats->mstats, sizeof(temp));
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+
+ mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries);
+ mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries);
+ mcast_stats_add_dir(tdst.igmp_v3queries, temp.igmp_v3queries);
+ mcast_stats_add_dir(tdst.igmp_leaves, temp.igmp_leaves);
+ mcast_stats_add_dir(tdst.igmp_v1reports, temp.igmp_v1reports);
+ mcast_stats_add_dir(tdst.igmp_v2reports, temp.igmp_v2reports);
+ mcast_stats_add_dir(tdst.igmp_v3reports, temp.igmp_v3reports);
+ tdst.igmp_parse_errors += temp.igmp_parse_errors;
+
+ mcast_stats_add_dir(tdst.mld_v1queries, temp.mld_v1queries);
+ mcast_stats_add_dir(tdst.mld_v2queries, temp.mld_v2queries);
+ mcast_stats_add_dir(tdst.mld_leaves, temp.mld_leaves);
+ mcast_stats_add_dir(tdst.mld_v1reports, temp.mld_v1reports);
+ mcast_stats_add_dir(tdst.mld_v2reports, temp.mld_v2reports);
+ tdst.mld_parse_errors += temp.mld_parse_errors;
+ }
+ memcpy(dest, &tdst, sizeof(*dest));
+}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 85e89f693..f2a29e467 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -851,6 +851,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 },
[IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
[IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1055,6 +1056,13 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
br->multicast_startup_query_interval = clock_t_to_jiffies(val);
}
+
+ if (data[IFLA_BR_MCAST_STATS_ENABLED]) {
+ __u8 mcast_stats;
+
+ mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
+ br->multicast_stats_enabled = !!mcast_stats;
+ }
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (data[IFLA_BR_NF_CALL_IPTABLES]) {
@@ -1110,6 +1118,7 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_STATS_ENABLED */
nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */
nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */
nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */
@@ -1187,6 +1196,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
br->multicast_query_use_ifaddr) ||
nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
+ br->multicast_stats_enabled) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
br->hash_elasticity) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
@@ -1234,7 +1245,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
return 0;
}
-static size_t br_get_linkxstats_size(const struct net_device *dev)
+static size_t bridge_get_linkxstats_size(const struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_vlan_group *vg;
@@ -1242,53 +1253,88 @@ static size_t br_get_linkxstats_size(const struct net_device *dev)
int numvls = 0;
vg = br_vlan_group(br);
- if (!vg)
- return 0;
-
- /* we need to count all, even placeholder entries */
- list_for_each_entry(v, &vg->vlan_list, vlist)
- numvls++;
+ if (vg) {
+ /* we need to count all, even placeholder entries */
+ list_for_each_entry(v, &vg->vlan_list, vlist)
+ numvls++;
+ }
- /* account for the vlans and the link xstats type nest attribute */
return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) +
+ nla_total_size(sizeof(struct br_mcast_stats)) +
nla_total_size(0);
}
-static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
- int *prividx)
+static size_t brport_get_linkxstats_size(const struct net_device *dev)
+{
+ return nla_total_size(sizeof(struct br_mcast_stats)) +
+ nla_total_size(0);
+}
+
+static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
+{
+ size_t retsize = 0;
+
+ switch (attr) {
+ case IFLA_STATS_LINK_XSTATS:
+ retsize = bridge_get_linkxstats_size(dev);
+ break;
+ case IFLA_STATS_LINK_XSTATS_SLAVE:
+ retsize = brport_get_linkxstats_size(dev);
+ break;
+ }
+
+ return retsize;
+}
+
+static int bridge_fill_linkxstats(struct sk_buff *skb,
+ const struct net_device *dev,
+ int *prividx)
{
struct net_bridge *br = netdev_priv(dev);
+ struct nlattr *nla __maybe_unused;
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *v;
struct nlattr *nest;
int vl_idx = 0;
- vg = br_vlan_group(br);
- if (!vg)
- goto out;
nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
if (!nest)
return -EMSGSIZE;
- list_for_each_entry(v, &vg->vlan_list, vlist) {
- struct bridge_vlan_xstats vxi;
- struct br_vlan_stats stats;
- if (++vl_idx < *prividx)
- continue;
- memset(&vxi, 0, sizeof(vxi));
- vxi.vid = v->vid;
- br_vlan_get_stats(v, &stats);
- vxi.rx_bytes = stats.rx_bytes;
- vxi.rx_packets = stats.rx_packets;
- vxi.tx_bytes = stats.tx_bytes;
- vxi.tx_packets = stats.tx_packets;
-
- if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi))
+ vg = br_vlan_group(br);
+ if (vg) {
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ struct bridge_vlan_xstats vxi;
+ struct br_vlan_stats stats;
+
+ if (++vl_idx < *prividx)
+ continue;
+ memset(&vxi, 0, sizeof(vxi));
+ vxi.vid = v->vid;
+ br_vlan_get_stats(v, &stats);
+ vxi.rx_bytes = stats.rx_bytes;
+ vxi.rx_packets = stats.rx_packets;
+ vxi.tx_bytes = stats.tx_bytes;
+ vxi.tx_packets = stats.tx_packets;
+
+ if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi))
+ goto nla_put_failure;
+ }
+ }
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (++vl_idx >= *prividx) {
+ nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST,
+ sizeof(struct br_mcast_stats),
+ BRIDGE_XSTATS_PAD);
+ if (!nla)
goto nla_put_failure;
+ br_multicast_get_stats(br, NULL, nla_data(nla));
}
+#endif
nla_nest_end(skb, nest);
*prividx = 0;
-out:
+
return 0;
nla_put_failure:
@@ -1298,6 +1344,52 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int brport_fill_linkxstats(struct sk_buff *skb,
+ const struct net_device *dev,
+ int *prividx)
+{
+ struct net_bridge_port *p = br_port_get_rtnl(dev);
+ struct nlattr *nla __maybe_unused;
+ struct nlattr *nest;
+
+ if (!p)
+ return 0;
+
+ nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
+ if (!nest)
+ return -EMSGSIZE;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST,
+ sizeof(struct br_mcast_stats),
+ BRIDGE_XSTATS_PAD);
+ if (!nla) {
+ nla_nest_end(skb, nest);
+ return -EMSGSIZE;
+ }
+ br_multicast_get_stats(p->br, p, nla_data(nla));
+#endif
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
+ int *prividx, int attr)
+{
+ int ret = -EINVAL;
+
+ switch (attr) {
+ case IFLA_STATS_LINK_XSTATS:
+ ret = bridge_fill_linkxstats(skb, dev, prividx);
+ break;
+ case IFLA_STATS_LINK_XSTATS_SLAVE:
+ ret = brport_fill_linkxstats(skb, dev, prividx);
+ break;
+ }
+
+ return ret;
+}
+
static struct rtnl_af_ops br_af_ops __read_mostly = {
.family = AF_BRIDGE,
.get_link_af_size = br_get_link_af_size_filtered,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 52edecf3c..aac2a6e6b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -75,6 +75,12 @@ struct bridge_mcast_querier {
struct br_ip addr;
struct net_bridge_port __rcu *port;
};
+
+/* IGMP/MLD statistics */
+struct bridge_mcast_stats {
+ struct br_mcast_stats mstats;
+ struct u64_stats_sync syncp;
+};
#endif
struct br_vlan_stats {
@@ -229,6 +235,7 @@ struct net_bridge_port
struct bridge_mcast_own_query ip6_own_query;
#endif /* IS_ENABLED(CONFIG_IPV6) */
unsigned char multicast_router;
+ struct bridge_mcast_stats __percpu *mcast_stats;
struct timer_list multicast_router_timer;
struct hlist_head mglist;
struct hlist_node rlist;
@@ -315,6 +322,7 @@ struct net_bridge
u8 multicast_querier:1;
u8 multicast_query_use_ifaddr:1;
u8 has_ipv6_addr:1;
+ u8 multicast_stats_enabled:1;
u32 hash_elasticity;
u32 hash_max;
@@ -337,6 +345,7 @@ struct net_bridge
struct bridge_mcast_other_query ip4_other_query;
struct bridge_mcast_own_query ip4_own_query;
struct bridge_mcast_querier ip4_querier;
+ struct bridge_mcast_stats __percpu *mcast_stats;
#if IS_ENABLED(CONFIG_IPV6)
struct bridge_mcast_other_query ip6_other_query;
struct bridge_mcast_own_query ip6_own_query;
@@ -496,14 +505,12 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid);
/* br_forward.c */
-void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb);
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
-void br_forward(const struct net_bridge_port *to,
- struct sk_buff *skb, struct sk_buff *skb0);
+void br_forward(const struct net_bridge_port *to, struct sk_buff *skb,
+ bool local_rcv, bool local_orig);
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
-void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast);
-void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
- struct sk_buff *skb2, bool unicast);
+void br_flood(struct net_bridge *br, struct sk_buff *skb,
+ bool unicast, bool local_rcv, bool local_orig);
/* br_if.c */
void br_port_carrier_check(struct net_bridge_port *p);
@@ -543,7 +550,7 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
struct sk_buff *skb, u16 vid);
struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
struct sk_buff *skb, u16 vid);
-void br_multicast_add_port(struct net_bridge_port *port);
+int br_multicast_add_port(struct net_bridge_port *port);
void br_multicast_del_port(struct net_bridge_port *port);
void br_multicast_enable_port(struct net_bridge_port *port);
void br_multicast_disable_port(struct net_bridge_port *port);
@@ -551,10 +558,8 @@ void br_multicast_init(struct net_bridge *br);
void br_multicast_open(struct net_bridge *br);
void br_multicast_stop(struct net_bridge *br);
void br_multicast_dev_del(struct net_bridge *br);
-void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
- struct sk_buff *skb);
-void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
- struct sk_buff *skb, struct sk_buff *skb2);
+void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb, bool local_rcv, bool local_orig);
int br_multicast_set_router(struct net_bridge *br, unsigned long val);
int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val);
int br_multicast_toggle(struct net_bridge *br, unsigned long val);
@@ -576,6 +581,12 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
struct br_ip *group, int type, u8 flags);
void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
int type);
+void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
+ const struct sk_buff *skb, u8 type, u8 dir);
+int br_multicast_init_stats(struct net_bridge *br);
+void br_multicast_get_stats(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct br_mcast_stats *dest);
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
@@ -623,6 +634,11 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br,
return false;
}
}
+
+static inline int br_multicast_igmp_type(const struct sk_buff *skb)
+{
+ return BR_INPUT_SKB_CB(skb)->igmp;
+}
#else
static inline int br_multicast_rcv(struct net_bridge *br,
struct net_bridge_port *port,
@@ -638,8 +654,9 @@ static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
return NULL;
}
-static inline void br_multicast_add_port(struct net_bridge_port *port)
+static inline int br_multicast_add_port(struct net_bridge_port *port)
{
+ return 0;
}
static inline void br_multicast_del_port(struct net_bridge_port *port)
@@ -670,31 +687,47 @@ static inline void br_multicast_dev_del(struct net_bridge *br)
{
}
-static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
- struct sk_buff *skb)
+static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb,
+ bool local_rcv, bool local_orig)
{
}
-static inline void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
- struct sk_buff *skb,
- struct sk_buff *skb2)
-{
-}
static inline bool br_multicast_is_router(struct net_bridge *br)
{
return 0;
}
+
static inline bool br_multicast_querier_exists(struct net_bridge *br,
struct ethhdr *eth)
{
return false;
}
+
static inline void br_mdb_init(void)
{
}
+
static inline void br_mdb_uninit(void)
{
}
+
+static inline void br_multicast_count(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const struct sk_buff *skb,
+ u8 type, u8 dir)
+{
+}
+
+static inline int br_multicast_init_stats(struct net_bridge *br)
+{
+ return 0;
+}
+
+static inline int br_multicast_igmp_type(const struct sk_buff *skb)
+{
+ return 0;
+}
#endif
/* br_vlan.c */
@@ -942,7 +975,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t);
int br_set_forward_delay(struct net_bridge *br, unsigned long x);
int br_set_hello_time(struct net_bridge *br, unsigned long x);
int br_set_max_age(struct net_bridge *br, unsigned long x);
-int br_set_ageing_time(struct net_bridge *br, u32 ageing_time);
+int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time);
/* br_stp_if.c */
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 9cb7044d0..9258b8ef1 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -570,7 +570,7 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
*
* Offloaded switch entries maybe more restrictive
*/
-int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
+int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
{
struct switchdev_attr attr = {
.orig_dev = br->dev,
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 984d46263..341caa0ca 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -55,7 +55,7 @@ void br_init_port(struct net_bridge_port *p)
netdev_err(p->dev, "failed to set HW ageing time\n");
}
-/* called under bridge lock */
+/* NO locks held */
void br_stp_enable_bridge(struct net_bridge *br)
{
struct net_bridge_port *p;
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index beb47071e..e120307c6 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -618,6 +618,30 @@ static ssize_t multicast_startup_query_interval_store(
return store_bridge_parm(d, buf, len, set_startup_query_interval);
}
static DEVICE_ATTR_RW(multicast_startup_query_interval);
+
+static ssize_t multicast_stats_enabled_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_stats_enabled);
+}
+
+static int set_stats_enabled(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_stats_enabled = !!val;
+ return 0;
+}
+
+static ssize_t multicast_stats_enabled_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_stats_enabled);
+}
+static DEVICE_ATTR_RW(multicast_stats_enabled);
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
static ssize_t nf_call_iptables_show(
@@ -784,6 +808,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_multicast_query_interval.attr,
&dev_attr_multicast_query_response_interval.attr,
&dev_attr_multicast_startup_query_interval.attr,
+ &dev_attr_multicast_stats_enabled.attr,
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
&dev_attr_nf_call_iptables.attr,
diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 2a449b7ab..5fc4affd9 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -20,16 +20,16 @@ ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par)
__be16 type = hdr->llc.ui.ctrl & IS_UI ? hdr->llc.ui.type : hdr->llc.ni.type;
if (info->bitmask & EBT_802_3_SAP) {
- if (FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP))
+ if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.ssap))
return false;
- if (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP))
+ if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.dsap))
return false;
}
if (info->bitmask & EBT_802_3_TYPE) {
if (!(hdr->llc.ui.dsap == CHECK_TYPE && hdr->llc.ui.ssap == CHECK_TYPE))
return false;
- if (FWINV(info->type != type, EBT_802_3_TYPE))
+ if (NF_INVF(info, EBT_802_3_TYPE, info->type != type))
return false;
}
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index cd457b891..227142282 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -25,14 +25,14 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par)
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
if (ah == NULL)
return false;
- if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode !=
- ah->ar_op, EBT_ARP_OPCODE))
+ if ((info->bitmask & EBT_ARP_OPCODE) &&
+ NF_INVF(info, EBT_ARP_OPCODE, info->opcode != ah->ar_op))
return false;
- if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype !=
- ah->ar_hrd, EBT_ARP_HTYPE))
+ if ((info->bitmask & EBT_ARP_HTYPE) &&
+ NF_INVF(info, EBT_ARP_HTYPE, info->htype != ah->ar_hrd))
return false;
- if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype !=
- ah->ar_pro, EBT_ARP_PTYPE))
+ if ((info->bitmask & EBT_ARP_PTYPE) &&
+ NF_INVF(info, EBT_ARP_PTYPE, info->ptype != ah->ar_pro))
return false;
if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP | EBT_ARP_GRAT)) {
@@ -51,21 +51,22 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par)
sizeof(daddr), &daddr);
if (dap == NULL)
return false;
- if (info->bitmask & EBT_ARP_SRC_IP &&
- FWINV(info->saddr != (*sap & info->smsk), EBT_ARP_SRC_IP))
+ if ((info->bitmask & EBT_ARP_SRC_IP) &&
+ NF_INVF(info, EBT_ARP_SRC_IP,
+ info->saddr != (*sap & info->smsk)))
return false;
- if (info->bitmask & EBT_ARP_DST_IP &&
- FWINV(info->daddr != (*dap & info->dmsk), EBT_ARP_DST_IP))
+ if ((info->bitmask & EBT_ARP_DST_IP) &&
+ NF_INVF(info, EBT_ARP_DST_IP,
+ info->daddr != (*dap & info->dmsk)))
return false;
- if (info->bitmask & EBT_ARP_GRAT &&
- FWINV(*dap != *sap, EBT_ARP_GRAT))
+ if ((info->bitmask & EBT_ARP_GRAT) &&
+ NF_INVF(info, EBT_ARP_GRAT, *dap != *sap))
return false;
}
if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) {
const unsigned char *mp;
unsigned char _mac[ETH_ALEN];
- uint8_t verdict, i;
if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER))
return false;
@@ -74,11 +75,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par)
sizeof(_mac), &_mac);
if (mp == NULL)
return false;
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (mp[i] ^ info->smaddr[i]) &
- info->smmsk[i];
- if (FWINV(verdict != 0, EBT_ARP_SRC_MAC))
+ if (NF_INVF(info, EBT_ARP_SRC_MAC,
+ !ether_addr_equal_masked(mp, info->smaddr,
+ info->smmsk)))
return false;
}
@@ -88,11 +87,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par)
sizeof(_mac), &_mac);
if (mp == NULL)
return false;
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (mp[i] ^ info->dmaddr[i]) &
- info->dmmsk[i];
- if (FWINV(verdict != 0, EBT_ARP_DST_MAC))
+ if (NF_INVF(info, EBT_ARP_DST_MAC,
+ !ether_addr_equal_masked(mp, info->dmaddr,
+ info->dmmsk)))
return false;
}
}
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index 23bca62d5..d06968bdf 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -36,19 +36,19 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par)
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (ih == NULL)
return false;
- if (info->bitmask & EBT_IP_TOS &&
- FWINV(info->tos != ih->tos, EBT_IP_TOS))
+ if ((info->bitmask & EBT_IP_TOS) &&
+ NF_INVF(info, EBT_IP_TOS, info->tos != ih->tos))
return false;
- if (info->bitmask & EBT_IP_SOURCE &&
- FWINV((ih->saddr & info->smsk) !=
- info->saddr, EBT_IP_SOURCE))
+ if ((info->bitmask & EBT_IP_SOURCE) &&
+ NF_INVF(info, EBT_IP_SOURCE,
+ (ih->saddr & info->smsk) != info->saddr))
return false;
if ((info->bitmask & EBT_IP_DEST) &&
- FWINV((ih->daddr & info->dmsk) !=
- info->daddr, EBT_IP_DEST))
+ NF_INVF(info, EBT_IP_DEST,
+ (ih->daddr & info->dmsk) != info->daddr))
return false;
if (info->bitmask & EBT_IP_PROTO) {
- if (FWINV(info->protocol != ih->protocol, EBT_IP_PROTO))
+ if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol))
return false;
if (!(info->bitmask & EBT_IP_DPORT) &&
!(info->bitmask & EBT_IP_SPORT))
@@ -61,16 +61,16 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par)
return false;
if (info->bitmask & EBT_IP_DPORT) {
u32 dst = ntohs(pptr->dst);
- if (FWINV(dst < info->dport[0] ||
- dst > info->dport[1],
- EBT_IP_DPORT))
+ if (NF_INVF(info, EBT_IP_DPORT,
+ dst < info->dport[0] ||
+ dst > info->dport[1]))
return false;
}
if (info->bitmask & EBT_IP_SPORT) {
u32 src = ntohs(pptr->src);
- if (FWINV(src < info->sport[0] ||
- src > info->sport[1],
- EBT_IP_SPORT))
+ if (NF_INVF(info, EBT_IP_SPORT,
+ src < info->sport[0] ||
+ src > info->sport[1]))
return false;
}
}
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 98de6e7fd..4617491be 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -45,15 +45,18 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h);
if (ih6 == NULL)
return false;
- if (info->bitmask & EBT_IP6_TCLASS &&
- FWINV(info->tclass != ipv6_get_dsfield(ih6), EBT_IP6_TCLASS))
+ if ((info->bitmask & EBT_IP6_TCLASS) &&
+ NF_INVF(info, EBT_IP6_TCLASS,
+ info->tclass != ipv6_get_dsfield(ih6)))
return false;
- if ((info->bitmask & EBT_IP6_SOURCE &&
- FWINV(ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk,
- &info->saddr), EBT_IP6_SOURCE)) ||
- (info->bitmask & EBT_IP6_DEST &&
- FWINV(ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk,
- &info->daddr), EBT_IP6_DEST)))
+ if (((info->bitmask & EBT_IP6_SOURCE) &&
+ NF_INVF(info, EBT_IP6_SOURCE,
+ ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk,
+ &info->saddr))) ||
+ ((info->bitmask & EBT_IP6_DEST) &&
+ NF_INVF(info, EBT_IP6_DEST,
+ ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk,
+ &info->daddr))))
return false;
if (info->bitmask & EBT_IP6_PROTO) {
uint8_t nexthdr = ih6->nexthdr;
@@ -63,7 +66,7 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr, &frag_off);
if (offset_ph == -1)
return false;
- if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO))
+ if (NF_INVF(info, EBT_IP6_PROTO, info->protocol != nexthdr))
return false;
if (!(info->bitmask & (EBT_IP6_DPORT |
EBT_IP6_SPORT | EBT_IP6_ICMP6)))
@@ -76,22 +79,24 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
return false;
if (info->bitmask & EBT_IP6_DPORT) {
u16 dst = ntohs(pptr->tcpudphdr.dst);
- if (FWINV(dst < info->dport[0] ||
- dst > info->dport[1], EBT_IP6_DPORT))
+ if (NF_INVF(info, EBT_IP6_DPORT,
+ dst < info->dport[0] ||
+ dst > info->dport[1]))
return false;
}
if (info->bitmask & EBT_IP6_SPORT) {
u16 src = ntohs(pptr->tcpudphdr.src);
- if (FWINV(src < info->sport[0] ||
- src > info->sport[1], EBT_IP6_SPORT))
+ if (NF_INVF(info, EBT_IP6_SPORT,
+ src < info->sport[0] ||
+ src > info->sport[1]))
return false;
}
if ((info->bitmask & EBT_IP6_ICMP6) &&
- FWINV(pptr->icmphdr.type < info->icmpv6_type[0] ||
- pptr->icmphdr.type > info->icmpv6_type[1] ||
- pptr->icmphdr.code < info->icmpv6_code[0] ||
- pptr->icmphdr.code > info->icmpv6_code[1],
- EBT_IP6_ICMP6))
+ NF_INVF(info, EBT_IP6_ICMP6,
+ pptr->icmphdr.type < info->icmpv6_type[0] ||
+ pptr->icmphdr.type > info->icmpv6_type[1] ||
+ pptr->icmphdr.code < info->icmpv6_code[0] ||
+ pptr->icmphdr.code > info->icmpv6_code[1]))
return false;
}
return true;
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 6b731e12e..3140eb912 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -17,24 +17,24 @@
#define BPDU_TYPE_TCN 0x80
struct stp_header {
- uint8_t dsap;
- uint8_t ssap;
- uint8_t ctrl;
- uint8_t pid;
- uint8_t vers;
- uint8_t type;
+ u8 dsap;
+ u8 ssap;
+ u8 ctrl;
+ u8 pid;
+ u8 vers;
+ u8 type;
};
struct stp_config_pdu {
- uint8_t flags;
- uint8_t root[8];
- uint8_t root_cost[4];
- uint8_t sender[8];
- uint8_t port[2];
- uint8_t msg_age[2];
- uint8_t max_age[2];
- uint8_t hello_time[2];
- uint8_t forward_delay[2];
+ u8 flags;
+ u8 root[8];
+ u8 root_cost[4];
+ u8 sender[8];
+ u8 port[2];
+ u8 msg_age[2];
+ u8 max_age[2];
+ u8 hello_time[2];
+ u8 forward_delay[2];
};
#define NR16(p) (p[0] << 8 | p[1])
@@ -44,76 +44,73 @@ static bool ebt_filter_config(const struct ebt_stp_info *info,
const struct stp_config_pdu *stpc)
{
const struct ebt_stp_config_info *c;
- uint16_t v16;
- uint32_t v32;
- int verdict, i;
+ u16 v16;
+ u32 v32;
c = &info->config;
if ((info->bitmask & EBT_STP_FLAGS) &&
- FWINV(c->flags != stpc->flags, EBT_STP_FLAGS))
+ NF_INVF(info, EBT_STP_FLAGS, c->flags != stpc->flags))
return false;
if (info->bitmask & EBT_STP_ROOTPRIO) {
v16 = NR16(stpc->root);
- if (FWINV(v16 < c->root_priol ||
- v16 > c->root_priou, EBT_STP_ROOTPRIO))
+ if (NF_INVF(info, EBT_STP_ROOTPRIO,
+ v16 < c->root_priol || v16 > c->root_priou))
return false;
}
if (info->bitmask & EBT_STP_ROOTADDR) {
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (stpc->root[2+i] ^ c->root_addr[i]) &
- c->root_addrmsk[i];
- if (FWINV(verdict != 0, EBT_STP_ROOTADDR))
+ if (NF_INVF(info, EBT_STP_ROOTADDR,
+ !ether_addr_equal_masked(&stpc->root[2],
+ c->root_addr,
+ c->root_addrmsk)))
return false;
}
if (info->bitmask & EBT_STP_ROOTCOST) {
v32 = NR32(stpc->root_cost);
- if (FWINV(v32 < c->root_costl ||
- v32 > c->root_costu, EBT_STP_ROOTCOST))
+ if (NF_INVF(info, EBT_STP_ROOTCOST,
+ v32 < c->root_costl || v32 > c->root_costu))
return false;
}
if (info->bitmask & EBT_STP_SENDERPRIO) {
v16 = NR16(stpc->sender);
- if (FWINV(v16 < c->sender_priol ||
- v16 > c->sender_priou, EBT_STP_SENDERPRIO))
+ if (NF_INVF(info, EBT_STP_SENDERPRIO,
+ v16 < c->sender_priol || v16 > c->sender_priou))
return false;
}
if (info->bitmask & EBT_STP_SENDERADDR) {
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (stpc->sender[2+i] ^ c->sender_addr[i]) &
- c->sender_addrmsk[i];
- if (FWINV(verdict != 0, EBT_STP_SENDERADDR))
+ if (NF_INVF(info, EBT_STP_SENDERADDR,
+ !ether_addr_equal_masked(&stpc->sender[2],
+ c->sender_addr,
+ c->sender_addrmsk)))
return false;
}
if (info->bitmask & EBT_STP_PORT) {
v16 = NR16(stpc->port);
- if (FWINV(v16 < c->portl ||
- v16 > c->portu, EBT_STP_PORT))
+ if (NF_INVF(info, EBT_STP_PORT,
+ v16 < c->portl || v16 > c->portu))
return false;
}
if (info->bitmask & EBT_STP_MSGAGE) {
v16 = NR16(stpc->msg_age);
- if (FWINV(v16 < c->msg_agel ||
- v16 > c->msg_ageu, EBT_STP_MSGAGE))
+ if (NF_INVF(info, EBT_STP_MSGAGE,
+ v16 < c->msg_agel || v16 > c->msg_ageu))
return false;
}
if (info->bitmask & EBT_STP_MAXAGE) {
v16 = NR16(stpc->max_age);
- if (FWINV(v16 < c->max_agel ||
- v16 > c->max_ageu, EBT_STP_MAXAGE))
+ if (NF_INVF(info, EBT_STP_MAXAGE,
+ v16 < c->max_agel || v16 > c->max_ageu))
return false;
}
if (info->bitmask & EBT_STP_HELLOTIME) {
v16 = NR16(stpc->hello_time);
- if (FWINV(v16 < c->hello_timel ||
- v16 > c->hello_timeu, EBT_STP_HELLOTIME))
+ if (NF_INVF(info, EBT_STP_HELLOTIME,
+ v16 < c->hello_timel || v16 > c->hello_timeu))
return false;
}
if (info->bitmask & EBT_STP_FWDD) {
v16 = NR16(stpc->forward_delay);
- if (FWINV(v16 < c->forward_delayl ||
- v16 > c->forward_delayu, EBT_STP_FWDD))
+ if (NF_INVF(info, EBT_STP_FWDD,
+ v16 < c->forward_delayl || v16 > c->forward_delayu))
return false;
}
return true;
@@ -125,7 +122,7 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct ebt_stp_info *info = par->matchinfo;
const struct stp_header *sp;
struct stp_header _stph;
- const uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00};
+ const u8 header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00};
sp = skb_header_pointer(skb, 0, sizeof(_stph), &_stph);
if (sp == NULL)
@@ -135,8 +132,8 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (memcmp(sp, header, sizeof(header)))
return false;
- if (info->bitmask & EBT_STP_TYPE &&
- FWINV(info->type != sp->type, EBT_STP_TYPE))
+ if ((info->bitmask & EBT_STP_TYPE) &&
+ NF_INVF(info, EBT_STP_TYPE, info->type != sp->type))
return false;
if (sp->type == BPDU_TYPE_CONFIG &&
@@ -156,8 +153,8 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par)
static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
{
const struct ebt_stp_info *info = par->matchinfo;
- const uint8_t bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00};
- const uint8_t msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ const u8 bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00};
+ const u8 msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
const struct ebt_entry *e = par->entryinfo;
if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK ||
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 5a61f3541..0833c251a 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -121,7 +121,6 @@ ebt_dev_check(const char *entry, const struct net_device *device)
return devname[i] != entry[i] && entry[i] != 1;
}
-#define FWINV2(bool, invflg) ((bool) ^ !!(e->invflags & invflg))
/* process standard matches */
static inline int
ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
@@ -130,7 +129,6 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
const struct ethhdr *h = eth_hdr(skb);
const struct net_bridge_port *p;
__be16 ethproto;
- int verdict, i;
if (skb_vlan_tag_present(skb))
ethproto = htons(ETH_P_8021Q);
@@ -138,38 +136,36 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
ethproto = h->h_proto;
if (e->bitmask & EBT_802_3) {
- if (FWINV2(eth_proto_is_802_3(ethproto), EBT_IPROTO))
+ if (NF_INVF(e, EBT_IPROTO, eth_proto_is_802_3(ethproto)))
return 1;
} else if (!(e->bitmask & EBT_NOPROTO) &&
- FWINV2(e->ethproto != ethproto, EBT_IPROTO))
+ NF_INVF(e, EBT_IPROTO, e->ethproto != ethproto))
return 1;
- if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN))
+ if (NF_INVF(e, EBT_IIN, ebt_dev_check(e->in, in)))
return 1;
- if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT))
+ if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out)))
return 1;
/* rcu_read_lock()ed by nf_hook_slow */
if (in && (p = br_port_get_rcu(in)) != NULL &&
- FWINV2(ebt_dev_check(e->logical_in, p->br->dev), EBT_ILOGICALIN))
+ NF_INVF(e, EBT_ILOGICALIN,
+ ebt_dev_check(e->logical_in, p->br->dev)))
return 1;
if (out && (p = br_port_get_rcu(out)) != NULL &&
- FWINV2(ebt_dev_check(e->logical_out, p->br->dev), EBT_ILOGICALOUT))
+ NF_INVF(e, EBT_ILOGICALOUT,
+ ebt_dev_check(e->logical_out, p->br->dev)))
return 1;
if (e->bitmask & EBT_SOURCEMAC) {
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (h->h_source[i] ^ e->sourcemac[i]) &
- e->sourcemsk[i];
- if (FWINV2(verdict != 0, EBT_ISOURCE))
+ if (NF_INVF(e, EBT_ISOURCE,
+ !ether_addr_equal_masked(h->h_source, e->sourcemac,
+ e->sourcemsk)))
return 1;
}
if (e->bitmask & EBT_DESTMAC) {
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (h->h_dest[i] ^ e->destmac[i]) &
- e->destmsk[i];
- if (FWINV2(verdict != 0, EBT_IDEST))
+ if (NF_INVF(e, EBT_IDEST,
+ !ether_addr_equal_masked(h->h_dest, e->destmac,
+ e->destmsk)))
return 1;
}
return 0;
@@ -372,6 +368,8 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) {
+ if (!IS_ERR(match))
+ module_put(match->me);
request_module("ebt_%s", m->u.name);
match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
}
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index 4b901d9f2..ad47a921b 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -86,6 +86,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = {
.init = nft_meta_set_init,
.destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
+ .validate = nft_meta_set_validate,
};
static const struct nft_expr_ops *
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 77f7e7a9e..0b77ffbc2 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -72,7 +72,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net,
nft_reject_br_push_etherhdr(oldskb, nskb);
- br_deliver(br_port_get_rcu(dev), nskb);
+ br_forward(br_port_get_rcu(dev), nskb, false, true);
}
static void nft_reject_br_send_v4_unreach(struct net *net,
@@ -140,7 +140,7 @@ static void nft_reject_br_send_v4_unreach(struct net *net,
nft_reject_br_push_etherhdr(oldskb, nskb);
- br_deliver(br_port_get_rcu(dev), nskb);
+ br_forward(br_port_get_rcu(dev), nskb, false, true);
}
static void nft_reject_br_send_v6_tcp_reset(struct net *net,
@@ -174,7 +174,7 @@ static void nft_reject_br_send_v6_tcp_reset(struct net *net,
nft_reject_br_push_etherhdr(oldskb, nskb);
- br_deliver(br_port_get_rcu(dev), nskb);
+ br_forward(br_port_get_rcu(dev), nskb, false, true);
}
static bool reject6_br_csum_ok(struct sk_buff *skb, int hook)
@@ -255,7 +255,7 @@ static void nft_reject_br_send_v6_unreach(struct net *net,
nft_reject_br_push_etherhdr(oldskb, nskb);
- br_deliver(br_port_get_rcu(dev), nskb);
+ br_forward(br_port_get_rcu(dev), nskb, false, true);
}
static void nft_reject_bridge_eval(const struct nft_expr *expr,
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 67a4a36fe..3408ed51b 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -13,7 +13,6 @@
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>
-#include <linux/moduleparam.h>
#include <linux/ip.h>
#include <linux/sched.h>
#include <linux/sockios.h>
diff --git a/net/can/Makefile b/net/can/Makefile
index cef49eb1f..10936754e 100644
--- a/net/can/Makefile
+++ b/net/can/Makefile
@@ -3,7 +3,8 @@
#
obj-$(CONFIG_CAN) += can.o
-can-y := af_can.o proc.o
+can-y := af_can.o
+can-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_CAN_RAW) += can-raw.o
can-raw-y := raw.o
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 166d43619..1108079d9 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -911,14 +911,14 @@ static __init int can_init(void)
if (!rcv_cache)
return -ENOMEM;
- if (stats_timer) {
+ if (IS_ENABLED(CONFIG_PROC_FS)) {
+ if (stats_timer) {
/* the statistics are updated every second (timer triggered) */
- setup_timer(&can_stattimer, can_stat_update, 0);
- mod_timer(&can_stattimer, round_jiffies(jiffies + HZ));
- } else
- can_stattimer.function = NULL;
-
- can_init_proc();
+ setup_timer(&can_stattimer, can_stat_update, 0);
+ mod_timer(&can_stattimer, round_jiffies(jiffies + HZ));
+ }
+ can_init_proc();
+ }
/* protocol register */
sock_register(&can_family_ops);
@@ -933,10 +933,12 @@ static __exit void can_exit(void)
{
struct net_device *dev;
- if (stats_timer)
- del_timer_sync(&can_stattimer);
+ if (IS_ENABLED(CONFIG_PROC_FS)) {
+ if (stats_timer)
+ del_timer_sync(&can_stattimer);
- can_remove_proc();
+ can_remove_proc();
+ }
/* protocol unregister */
dev_remove_pack(&canfd_packet);
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 6863310d6..8e999ffdf 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1,7 +1,7 @@
/*
* bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
*
- * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * Copyright (c) 2002-2016 Volkswagen Group Electronic Research
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -67,27 +67,31 @@
*/
#define MAX_NFRAMES 256
-/* use of last_frames[index].can_dlc */
+/* use of last_frames[index].flags */
#define RX_RECV 0x40 /* received data for this element */
#define RX_THR 0x80 /* element not been sent due to throttle feature */
-#define BCM_CAN_DLC_MASK 0x0F /* clean private flags in can_dlc by masking */
+#define BCM_CAN_FLAGS_MASK 0x3F /* to clean private flags after usage */
/* get best masking value for can_rx_register() for a given single can_id */
#define REGMASK(id) ((id & CAN_EFF_FLAG) ? \
(CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
(CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
-#define CAN_BCM_VERSION CAN_VERSION
+#define CAN_BCM_VERSION "20160617"
MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
MODULE_ALIAS("can-proto-2");
-/* easy access to can_frame payload */
-static inline u64 GET_U64(const struct can_frame *cp)
+/*
+ * easy access to the first 64 bit of can(fd)_frame payload. cp->data is
+ * 64 bit aligned so the offset has to be multiples of 8 which is ensured
+ * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler().
+ */
+static inline u64 get_u64(const struct canfd_frame *cp, int offset)
{
- return *(u64 *)cp->data;
+ return *(u64 *)(cp->data + offset);
}
struct bcm_op {
@@ -101,13 +105,14 @@ struct bcm_op {
struct tasklet_struct tsklet, thrtsklet;
ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
int rx_ifindex;
+ int cfsiz;
u32 count;
u32 nframes;
u32 currframe;
- struct can_frame *frames;
- struct can_frame *last_frames;
- struct can_frame sframe;
- struct can_frame last_sframe;
+ struct canfd_frame *frames;
+ struct canfd_frame *last_frames;
+ struct canfd_frame sframe;
+ struct canfd_frame last_sframe;
struct sock *sk;
struct net_device *rx_reg_dev;
};
@@ -136,7 +141,7 @@ static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv)
return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);
}
-#define CFSIZ sizeof(struct can_frame)
+#define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU)
#define OPSIZ sizeof(struct bcm_op)
#define MHSIZ sizeof(struct bcm_msg_head)
@@ -183,43 +188,50 @@ static int bcm_proc_show(struct seq_file *m, void *v)
if (!op->frames_abs)
continue;
- seq_printf(m, "rx_op: %03X %-5s ",
- op->can_id, bcm_proc_getifname(ifname, op->ifindex));
- seq_printf(m, "[%u]%c ", op->nframes,
- (op->flags & RX_CHECK_DLC)?'d':' ');
+ seq_printf(m, "rx_op: %03X %-5s ", op->can_id,
+ bcm_proc_getifname(ifname, op->ifindex));
+
+ if (op->flags & CAN_FD_FRAME)
+ seq_printf(m, "(%u)", op->nframes);
+ else
+ seq_printf(m, "[%u]", op->nframes);
+
+ seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' ');
+
if (op->kt_ival1.tv64)
seq_printf(m, "timeo=%lld ",
- (long long)
- ktime_to_us(op->kt_ival1));
+ (long long)ktime_to_us(op->kt_ival1));
if (op->kt_ival2.tv64)
seq_printf(m, "thr=%lld ",
- (long long)
- ktime_to_us(op->kt_ival2));
+ (long long)ktime_to_us(op->kt_ival2));
seq_printf(m, "# recv %ld (%ld) => reduction: ",
- op->frames_filtered, op->frames_abs);
+ op->frames_filtered, op->frames_abs);
reduction = 100 - (op->frames_filtered * 100) / op->frames_abs;
seq_printf(m, "%s%ld%%\n",
- (reduction == 100)?"near ":"", reduction);
+ (reduction == 100) ? "near " : "", reduction);
}
list_for_each_entry(op, &bo->tx_ops, list) {
- seq_printf(m, "tx_op: %03X %s [%u] ",
- op->can_id,
- bcm_proc_getifname(ifname, op->ifindex),
- op->nframes);
+ seq_printf(m, "tx_op: %03X %s ", op->can_id,
+ bcm_proc_getifname(ifname, op->ifindex));
+
+ if (op->flags & CAN_FD_FRAME)
+ seq_printf(m, "(%u) ", op->nframes);
+ else
+ seq_printf(m, "[%u] ", op->nframes);
if (op->kt_ival1.tv64)
seq_printf(m, "t1=%lld ",
- (long long) ktime_to_us(op->kt_ival1));
+ (long long)ktime_to_us(op->kt_ival1));
if (op->kt_ival2.tv64)
seq_printf(m, "t2=%lld ",
- (long long) ktime_to_us(op->kt_ival2));
+ (long long)ktime_to_us(op->kt_ival2));
seq_printf(m, "# sent %ld\n", op->frames_abs);
}
@@ -248,7 +260,7 @@ static void bcm_can_tx(struct bcm_op *op)
{
struct sk_buff *skb;
struct net_device *dev;
- struct can_frame *cf = &op->frames[op->currframe];
+ struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe;
/* no target device? => exit */
if (!op->ifindex)
@@ -260,7 +272,7 @@ static void bcm_can_tx(struct bcm_op *op)
return;
}
- skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), gfp_any());
+ skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any());
if (!skb)
goto out;
@@ -268,7 +280,7 @@ static void bcm_can_tx(struct bcm_op *op)
can_skb_prv(skb)->ifindex = dev->ifindex;
can_skb_prv(skb)->skbcnt = 0;
- memcpy(skb_put(skb, CFSIZ), cf, CFSIZ);
+ memcpy(skb_put(skb, op->cfsiz), cf, op->cfsiz);
/* send with loopback */
skb->dev = dev;
@@ -282,7 +294,7 @@ static void bcm_can_tx(struct bcm_op *op)
/* reached last frame? */
if (op->currframe >= op->nframes)
op->currframe = 0;
- out:
+out:
dev_put(dev);
}
@@ -291,13 +303,13 @@ static void bcm_can_tx(struct bcm_op *op)
* (consisting of bcm_msg_head + x CAN frames)
*/
static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
- struct can_frame *frames, int has_timestamp)
+ struct canfd_frame *frames, int has_timestamp)
{
struct sk_buff *skb;
- struct can_frame *firstframe;
+ struct canfd_frame *firstframe;
struct sockaddr_can *addr;
struct sock *sk = op->sk;
- unsigned int datalen = head->nframes * CFSIZ;
+ unsigned int datalen = head->nframes * op->cfsiz;
int err;
skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
@@ -307,19 +319,19 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
memcpy(skb_put(skb, sizeof(*head)), head, sizeof(*head));
if (head->nframes) {
- /* can_frames starting here */
- firstframe = (struct can_frame *)skb_tail_pointer(skb);
+ /* CAN frames starting here */
+ firstframe = (struct canfd_frame *)skb_tail_pointer(skb);
memcpy(skb_put(skb, datalen), frames, datalen);
/*
- * the BCM uses the can_dlc-element of the can_frame
+ * the BCM uses the flags-element of the canfd_frame
* structure for internal purposes. This is only
* relevant for updates that are generated by the
* BCM, where nframes is 1
*/
if (head->nframes == 1)
- firstframe->can_dlc &= BCM_CAN_DLC_MASK;
+ firstframe->flags &= BCM_CAN_FLAGS_MASK;
}
if (has_timestamp) {
@@ -406,7 +418,7 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
/*
* bcm_rx_changed - create a RX_CHANGED notification due to changed content
*/
-static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
+static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data)
{
struct bcm_msg_head head;
@@ -418,7 +430,7 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
op->frames_filtered = op->frames_abs = 0;
/* this element is not throttled anymore */
- data->can_dlc &= (BCM_CAN_DLC_MASK|RX_RECV);
+ data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV);
head.opcode = RX_CHANGED;
head.flags = op->flags;
@@ -437,13 +449,13 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
* 2. send a notification to the user (if possible)
*/
static void bcm_rx_update_and_send(struct bcm_op *op,
- struct can_frame *lastdata,
- const struct can_frame *rxdata)
+ struct canfd_frame *lastdata,
+ const struct canfd_frame *rxdata)
{
- memcpy(lastdata, rxdata, CFSIZ);
+ memcpy(lastdata, rxdata, op->cfsiz);
/* mark as used and throttled by default */
- lastdata->can_dlc |= (RX_RECV|RX_THR);
+ lastdata->flags |= (RX_RECV|RX_THR);
/* throttling mode inactive ? */
if (!op->kt_ival2.tv64) {
@@ -481,33 +493,36 @@ rx_changed_settime:
* received data stored in op->last_frames[]
*/
static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
- const struct can_frame *rxdata)
+ const struct canfd_frame *rxdata)
{
+ struct canfd_frame *cf = op->frames + op->cfsiz * index;
+ struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
+ int i;
+
/*
- * no one uses the MSBs of can_dlc for comparison,
+ * no one uses the MSBs of flags for comparison,
* so we use it here to detect the first time of reception
*/
- if (!(op->last_frames[index].can_dlc & RX_RECV)) {
+ if (!(lcf->flags & RX_RECV)) {
/* received data for the first time => send update to user */
- bcm_rx_update_and_send(op, &op->last_frames[index], rxdata);
+ bcm_rx_update_and_send(op, lcf, rxdata);
return;
}
- /* do a real check in can_frame data section */
-
- if ((GET_U64(&op->frames[index]) & GET_U64(rxdata)) !=
- (GET_U64(&op->frames[index]) & GET_U64(&op->last_frames[index]))) {
- bcm_rx_update_and_send(op, &op->last_frames[index], rxdata);
- return;
+ /* do a real check in CAN frame data section */
+ for (i = 0; i < rxdata->len; i += 8) {
+ if ((get_u64(cf, i) & get_u64(rxdata, i)) !=
+ (get_u64(cf, i) & get_u64(lcf, i))) {
+ bcm_rx_update_and_send(op, lcf, rxdata);
+ return;
+ }
}
if (op->flags & RX_CHECK_DLC) {
- /* do a real check in can_frame dlc */
- if (rxdata->can_dlc != (op->last_frames[index].can_dlc &
- BCM_CAN_DLC_MASK)) {
- bcm_rx_update_and_send(op, &op->last_frames[index],
- rxdata);
+ /* do a real check in CAN frame length */
+ if (rxdata->len != lcf->len) {
+ bcm_rx_update_and_send(op, lcf, rxdata);
return;
}
}
@@ -556,8 +571,8 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
/* if user wants to be informed, when cyclic CAN-Messages come back */
if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
- /* clear received can_frames to indicate 'nothing received' */
- memset(op->last_frames, 0, op->nframes * CFSIZ);
+ /* clear received CAN frames to indicate 'nothing received' */
+ memset(op->last_frames, 0, op->nframes * op->cfsiz);
}
return HRTIMER_NORESTART;
@@ -569,9 +584,11 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
unsigned int index)
{
- if ((op->last_frames) && (op->last_frames[index].can_dlc & RX_THR)) {
+ struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
+
+ if ((op->last_frames) && (lcf->flags & RX_THR)) {
if (update)
- bcm_rx_changed(op, &op->last_frames[index]);
+ bcm_rx_changed(op, lcf);
return 1;
}
return 0;
@@ -636,15 +653,19 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
static void bcm_rx_handler(struct sk_buff *skb, void *data)
{
struct bcm_op *op = (struct bcm_op *)data;
- const struct can_frame *rxframe = (struct can_frame *)skb->data;
+ const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data;
unsigned int i;
- /* disable timeout */
- hrtimer_cancel(&op->timer);
-
if (op->can_id != rxframe->can_id)
return;
+ /* make sure to handle the correct frame type (CAN / CAN FD) */
+ if (skb->len != op->cfsiz)
+ return;
+
+ /* disable timeout */
+ hrtimer_cancel(&op->timer);
+
/* save rx timestamp */
op->rx_stamp = skb->tstamp;
/* save originator for recvfrom() */
@@ -675,13 +696,14 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data)
* multiplex compare
*
* find the first multiplex mask that fits.
- * Remark: The MUX-mask is stored in index 0
+ * Remark: The MUX-mask is stored in index 0 - but only the
+ * first 64 bits of the frame data[] are relevant (CAN FD)
*/
for (i = 1; i < op->nframes; i++) {
- if ((GET_U64(&op->frames[0]) & GET_U64(rxframe)) ==
- (GET_U64(&op->frames[0]) &
- GET_U64(&op->frames[i]))) {
+ if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) ==
+ (get_u64(op->frames, 0) &
+ get_u64(op->frames + op->cfsiz * i, 0))) {
bcm_rx_cmp_to_index(op, i, rxframe);
break;
}
@@ -695,13 +717,14 @@ rx_starttimer:
/*
* helpers for bcm_op handling: find & delete bcm [rx|tx] op elements
*/
-static struct bcm_op *bcm_find_op(struct list_head *ops, canid_t can_id,
- int ifindex)
+static struct bcm_op *bcm_find_op(struct list_head *ops,
+ struct bcm_msg_head *mh, int ifindex)
{
struct bcm_op *op;
list_for_each_entry(op, ops, list) {
- if ((op->can_id == can_id) && (op->ifindex == ifindex))
+ if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
+ (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME))
return op;
}
@@ -744,12 +767,14 @@ static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op)
/*
* bcm_delete_rx_op - find and remove a rx op (returns number of removed ops)
*/
-static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex)
+static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
+ int ifindex)
{
struct bcm_op *op, *n;
list_for_each_entry_safe(op, n, ops, list) {
- if ((op->can_id == can_id) && (op->ifindex == ifindex)) {
+ if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
+ (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
/*
* Don't care if we're bound or not (due to netdev
@@ -789,12 +814,14 @@ static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex)
/*
* bcm_delete_tx_op - find and remove a tx op (returns number of removed ops)
*/
-static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex)
+static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh,
+ int ifindex)
{
struct bcm_op *op, *n;
list_for_each_entry_safe(op, n, ops, list) {
- if ((op->can_id == can_id) && (op->ifindex == ifindex)) {
+ if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
+ (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
list_del(&op->list);
bcm_remove_op(op);
return 1; /* done */
@@ -810,7 +837,7 @@ static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex)
static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head,
int ifindex)
{
- struct bcm_op *op = bcm_find_op(ops, msg_head->can_id, ifindex);
+ struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex);
if (!op)
return -EINVAL;
@@ -835,6 +862,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
{
struct bcm_sock *bo = bcm_sk(sk);
struct bcm_op *op;
+ struct canfd_frame *cf;
unsigned int i;
int err;
@@ -842,39 +870,46 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (!ifindex)
return -ENODEV;
- /* check nframes boundaries - we need at least one can_frame */
+ /* check nframes boundaries - we need at least one CAN frame */
if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
return -EINVAL;
/* check the given can_id */
- op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex);
-
+ op = bcm_find_op(&bo->tx_ops, msg_head, ifindex);
if (op) {
/* update existing BCM operation */
/*
- * Do we need more space for the can_frames than currently
+ * Do we need more space for the CAN frames than currently
* allocated? -> This is a _really_ unusual use-case and
* therefore (complexity / locking) it is not supported.
*/
if (msg_head->nframes > op->nframes)
return -E2BIG;
- /* update can_frames content */
+ /* update CAN frames content */
for (i = 0; i < msg_head->nframes; i++) {
- err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ);
- if (op->frames[i].can_dlc > 8)
- err = -EINVAL;
+ cf = op->frames + op->cfsiz * i;
+ err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);
+
+ if (op->flags & CAN_FD_FRAME) {
+ if (cf->len > 64)
+ err = -EINVAL;
+ } else {
+ if (cf->len > 8)
+ err = -EINVAL;
+ }
if (err < 0)
return err;
if (msg_head->flags & TX_CP_CAN_ID) {
/* copy can_id into frame */
- op->frames[i].can_id = msg_head->can_id;
+ cf->can_id = msg_head->can_id;
}
}
+ op->flags = msg_head->flags;
} else {
/* insert new BCM operation for the given can_id */
@@ -883,11 +918,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (!op)
return -ENOMEM;
- op->can_id = msg_head->can_id;
+ op->can_id = msg_head->can_id;
+ op->cfsiz = CFSIZ(msg_head->flags);
+ op->flags = msg_head->flags;
- /* create array for can_frames and copy the data */
+ /* create array for CAN frames and copy the data */
if (msg_head->nframes > 1) {
- op->frames = kmalloc(msg_head->nframes * CFSIZ,
+ op->frames = kmalloc(msg_head->nframes * op->cfsiz,
GFP_KERNEL);
if (!op->frames) {
kfree(op);
@@ -897,10 +934,17 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->frames = &op->sframe;
for (i = 0; i < msg_head->nframes; i++) {
- err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ);
- if (op->frames[i].can_dlc > 8)
- err = -EINVAL;
+ cf = op->frames + op->cfsiz * i;
+ err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);
+
+ if (op->flags & CAN_FD_FRAME) {
+ if (cf->len > 64)
+ err = -EINVAL;
+ } else {
+ if (cf->len > 8)
+ err = -EINVAL;
+ }
if (err < 0) {
if (op->frames != &op->sframe)
@@ -911,7 +955,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (msg_head->flags & TX_CP_CAN_ID) {
/* copy can_id into frame */
- op->frames[i].can_id = msg_head->can_id;
+ cf->can_id = msg_head->can_id;
}
}
@@ -946,8 +990,6 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
/* check flags */
- op->flags = msg_head->flags;
-
if (op->flags & TX_RESET_MULTI_IDX) {
/* start multiple frame transmission with index 0 */
op->currframe = 0;
@@ -968,7 +1010,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (op->flags & STARTTIMER) {
hrtimer_cancel(&op->timer);
- /* spec: send can_frame when starting timer */
+ /* spec: send CAN frame when starting timer */
op->flags |= TX_ANNOUNCE;
}
@@ -981,7 +1023,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (op->flags & STARTTIMER)
bcm_tx_start_timer(op);
- return msg_head->nframes * CFSIZ + MHSIZ;
+ return msg_head->nframes * op->cfsiz + MHSIZ;
}
/*
@@ -1012,12 +1054,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
return -EINVAL;
/* check the given can_id */
- op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex);
+ op = bcm_find_op(&bo->rx_ops, msg_head, ifindex);
if (op) {
/* update existing BCM operation */
/*
- * Do we need more space for the can_frames than currently
+ * Do we need more space for the CAN frames than currently
* allocated? -> This is a _really_ unusual use-case and
* therefore (complexity / locking) it is not supported.
*/
@@ -1025,17 +1067,18 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
return -E2BIG;
if (msg_head->nframes) {
- /* update can_frames content */
+ /* update CAN frames content */
err = memcpy_from_msg((u8 *)op->frames, msg,
- msg_head->nframes * CFSIZ);
+ msg_head->nframes * op->cfsiz);
if (err < 0)
return err;
/* clear last_frames to indicate 'nothing received' */
- memset(op->last_frames, 0, msg_head->nframes * CFSIZ);
+ memset(op->last_frames, 0, msg_head->nframes * op->cfsiz);
}
op->nframes = msg_head->nframes;
+ op->flags = msg_head->flags;
/* Only an update -> do not call can_rx_register() */
do_rx_register = 0;
@@ -1046,20 +1089,22 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (!op)
return -ENOMEM;
- op->can_id = msg_head->can_id;
- op->nframes = msg_head->nframes;
+ op->can_id = msg_head->can_id;
+ op->nframes = msg_head->nframes;
+ op->cfsiz = CFSIZ(msg_head->flags);
+ op->flags = msg_head->flags;
if (msg_head->nframes > 1) {
- /* create array for can_frames and copy the data */
- op->frames = kmalloc(msg_head->nframes * CFSIZ,
+ /* create array for CAN frames and copy the data */
+ op->frames = kmalloc(msg_head->nframes * op->cfsiz,
GFP_KERNEL);
if (!op->frames) {
kfree(op);
return -ENOMEM;
}
- /* create and init array for received can_frames */
- op->last_frames = kzalloc(msg_head->nframes * CFSIZ,
+ /* create and init array for received CAN frames */
+ op->last_frames = kzalloc(msg_head->nframes * op->cfsiz,
GFP_KERNEL);
if (!op->last_frames) {
kfree(op->frames);
@@ -1074,7 +1119,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (msg_head->nframes) {
err = memcpy_from_msg((u8 *)op->frames, msg,
- msg_head->nframes * CFSIZ);
+ msg_head->nframes * op->cfsiz);
if (err < 0) {
if (op->frames != &op->sframe)
kfree(op->frames);
@@ -1116,7 +1161,6 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
} /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */
/* check flags */
- op->flags = msg_head->flags;
if (op->flags & RX_RTR_FRAME) {
@@ -1188,13 +1232,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
}
}
- return msg_head->nframes * CFSIZ + MHSIZ;
+ return msg_head->nframes * op->cfsiz + MHSIZ;
}
/*
* bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg)
*/
-static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
+static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk,
+ int cfsiz)
{
struct sk_buff *skb;
struct net_device *dev;
@@ -1204,13 +1249,13 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
if (!ifindex)
return -ENODEV;
- skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), GFP_KERNEL);
+ skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL);
if (!skb)
return -ENOMEM;
can_skb_reserve(skb);
- err = memcpy_from_msg(skb_put(skb, CFSIZ), msg, CFSIZ);
+ err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz);
if (err < 0) {
kfree_skb(skb);
return err;
@@ -1232,7 +1277,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
if (err)
return err;
- return CFSIZ + MHSIZ;
+ return cfsiz + MHSIZ;
}
/*
@@ -1244,13 +1289,23 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
struct bcm_sock *bo = bcm_sk(sk);
int ifindex = bo->ifindex; /* default ifindex for this bcm_op */
struct bcm_msg_head msg_head;
+ int cfsiz;
int ret; /* read bytes or error codes as return value */
if (!bo->bound)
return -ENOTCONN;
/* check for valid message length from userspace */
- if (size < MHSIZ || (size - MHSIZ) % CFSIZ)
+ if (size < MHSIZ)
+ return -EINVAL;
+
+ /* read message head information */
+ ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ);
+ if (ret < 0)
+ return ret;
+
+ cfsiz = CFSIZ(msg_head.flags);
+ if ((size - MHSIZ) % cfsiz)
return -EINVAL;
/* check for alternative ifindex for this bcm_op */
@@ -1284,12 +1339,6 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
}
}
- /* read message head information */
-
- ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ);
- if (ret < 0)
- return ret;
-
lock_sock(sk);
switch (msg_head.opcode) {
@@ -1303,14 +1352,14 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
break;
case TX_DELETE:
- if (bcm_delete_tx_op(&bo->tx_ops, msg_head.can_id, ifindex))
+ if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex))
ret = MHSIZ;
else
ret = -EINVAL;
break;
case RX_DELETE:
- if (bcm_delete_rx_op(&bo->rx_ops, msg_head.can_id, ifindex))
+ if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex))
ret = MHSIZ;
else
ret = -EINVAL;
@@ -1329,11 +1378,11 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
break;
case TX_SEND:
- /* we need exactly one can_frame behind the msg head */
- if ((msg_head.nframes != 1) || (size != CFSIZ + MHSIZ))
+ /* we need exactly one CAN frame behind the msg head */
+ if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ))
ret = -EINVAL;
else
- ret = bcm_tx_send(msg, ifindex, sk);
+ ret = bcm_tx_send(msg, ifindex, sk, cfsiz);
break;
default:
diff --git a/net/can/proc.c b/net/can/proc.c
index 1a19b985a..85ef7bb0f 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -517,8 +517,7 @@ void can_init_proc(void)
can_dir = proc_mkdir("can", init_net.proc_net);
if (!can_dir) {
- printk(KERN_INFO "can: failed to create /proc/net/can . "
- "CONFIG_PROC_FS missing?\n");
+ pr_info("can: failed to create /proc/net/can.\n");
return;
}
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 958d98569..84cbed630 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
crypto.o armor.o \
auth_x.o \
ceph_fs.o ceph_strings.o ceph_hash.o \
- pagevec.o snapshot.o
+ pagevec.o snapshot.o string_table.o
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 55d2bfee1..bddfcf6f0 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -747,6 +747,8 @@ out:
static void __exit exit_ceph_lib(void)
{
dout("exit_ceph_lib\n");
+ WARN_ON(!ceph_strings_empty());
+
ceph_osdc_cleanup();
ceph_msgr_exit();
ceph_crypto_shutdown();
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 41466ccb9..7d54e944d 100644
--- a/net/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -9,9 +9,9 @@
*/
int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
{
- __u32 su = le32_to_cpu(layout->fl_stripe_unit);
- __u32 sc = le32_to_cpu(layout->fl_stripe_count);
- __u32 os = le32_to_cpu(layout->fl_object_size);
+ __u32 su = layout->stripe_unit;
+ __u32 sc = layout->stripe_count;
+ __u32 os = layout->object_size;
/* stripe unit, object size must be non-zero, 64k increment */
if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
@@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
return 1;
}
+void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
+ fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
+ fl->object_size = le32_to_cpu(legacy->fl_object_size);
+ fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
+ if (fl->pool_id == 0)
+ fl->pool_id = -1;
+}
+EXPORT_SYMBOL(ceph_file_layout_from_legacy);
+
+void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
+ legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
+ legacy->fl_object_size = cpu_to_le32(fl->object_size);
+ if (fl->pool_id >= 0)
+ legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
+ else
+ legacy->fl_pg_pool = 0;
+}
+EXPORT_SYMBOL(ceph_file_layout_to_legacy);
int ceph_flags_to_mode(int flags)
{
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index e77b04ca7..c62b2b029 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -156,8 +156,16 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
seq_printf(s, "]/%d\t[", t->up.primary);
for (i = 0; i < t->acting.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
- seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
- t->target_oid.name_len, t->target_oid.name, t->flags);
+ seq_printf(s, "]/%d\t", t->acting.primary);
+ if (t->target_oloc.pool_ns) {
+ seq_printf(s, "%*pE/%*pE\t0x%x",
+ (int)t->target_oloc.pool_ns->len,
+ t->target_oloc.pool_ns->str,
+ t->target_oid.name_len, t->target_oid.name, t->flags);
+ } else {
+ seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len,
+ t->target_oid.name, t->flags);
+ }
if (t->paused)
seq_puts(s, "\tP");
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 37c38a7fb..ef34a0271 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -227,9 +227,10 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
}
const char *ceph_sub_str[] = {
- [CEPH_SUB_MDSMAP] = "mdsmap",
[CEPH_SUB_MONMAP] = "monmap",
[CEPH_SUB_OSDMAP] = "osdmap",
+ [CEPH_SUB_FSMAP] = "fsmap.user",
+ [CEPH_SUB_MDSMAP] = "mdsmap",
};
/*
@@ -573,7 +574,7 @@ static void complete_generic_request(struct ceph_mon_generic_request *req)
put_generic_request(req);
}
-void cancel_generic_request(struct ceph_mon_generic_request *req)
+static void cancel_generic_request(struct ceph_mon_generic_request *req)
{
struct ceph_mon_client *monc = req->monc;
struct ceph_mon_generic_request *lookup_req;
@@ -1193,6 +1194,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
case CEPH_MSG_MON_MAP:
case CEPH_MSG_MDS_MAP:
case CEPH_MSG_OSD_MAP:
+ case CEPH_MSG_FS_MAP_USER:
m = ceph_msg_new(type, front_len, GFP_NOFS, false);
if (!m)
return NULL; /* ENOMEM--return skip == 0 */
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index ddec1c10a..aaed59a47 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/vmalloc.h>
+#include <linux/ceph/messenger.h>
#include <linux/ceph/msgpool.h>
static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index aee117f83..a97e7b506 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -387,7 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest,
static void target_destroy(struct ceph_osd_request_target *t)
{
ceph_oid_destroy(&t->base_oid);
+ ceph_oloc_destroy(&t->base_oloc);
ceph_oid_destroy(&t->target_oid);
+ ceph_oloc_destroy(&t->target_oloc);
}
/*
@@ -533,6 +535,11 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
}
EXPORT_SYMBOL(ceph_osdc_alloc_request);
+static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc)
+{
+ return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
+}
+
int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
{
struct ceph_osd_client *osdc = req->r_osdc;
@@ -540,11 +547,13 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
int msg_size;
WARN_ON(ceph_oid_empty(&req->r_base_oid));
+ WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
/* create request message */
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
- msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+ msg_size += CEPH_ENCODING_START_BLK_LEN +
+ ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
msg_size += 4 + req->r_base_oid.name_len; /* oid */
msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
@@ -932,7 +941,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
osd_req_op_init(req, which, opcode, 0);
} else {
- u32 object_size = le32_to_cpu(layout->fl_object_size);
+ u32 object_size = layout->object_size;
u32 object_base = off - objoff;
if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
if (truncate_size <= object_base) {
@@ -948,7 +957,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
}
req->r_flags = flags;
- req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+ req->r_base_oloc.pool = layout->pool_id;
+ req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
req->r_snapid = vino.snap;
@@ -1489,12 +1499,16 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
p += sizeof(req->r_replay_version);
/* oloc */
- ceph_encode_8(&p, 4);
- ceph_encode_8(&p, 4);
- ceph_encode_32(&p, 8 + 4 + 4);
+ ceph_start_encoding(&p, 5, 4,
+ ceph_oloc_encoding_size(&req->r_t.target_oloc));
ceph_encode_64(&p, req->r_t.target_oloc.pool);
ceph_encode_32(&p, -1); /* preferred */
ceph_encode_32(&p, 0); /* key len */
+ if (req->r_t.target_oloc.pool_ns)
+ ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str,
+ req->r_t.target_oloc.pool_ns->len);
+ else
+ ceph_encode_32(&p, 0);
/* pgid */
ceph_encode_8(&p, 1);
@@ -2594,9 +2608,22 @@ static int ceph_oloc_decode(void **p, void *end,
}
if (struct_v >= 5) {
+ bool changed = false;
+
len = ceph_decode_32(p);
if (len > 0) {
- pr_warn("ceph_object_locator::nspace is set\n");
+ ceph_decode_need(p, end, len, e_inval);
+ if (!oloc->pool_ns ||
+ ceph_compare_string(oloc->pool_ns, *p, len))
+ changed = true;
+ *p += len;
+ } else {
+ if (oloc->pool_ns)
+ changed = true;
+ }
+ if (changed) {
+ /* redirect changes namespace */
+ pr_warn("ceph_object_locator::nspace is changed\n");
goto e_inval;
}
}
@@ -2806,7 +2833,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
goto out_unlock_session;
}
+ m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
ret = decode_MOSDOpReply(msg, &m);
+ m.redirect.oloc.pool_ns = NULL;
if (ret) {
pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
req->r_tid, ret);
@@ -2835,7 +2864,11 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
unlink_request(osd, req);
mutex_unlock(&osd->lock);
- ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+ /*
+ * Not ceph_oloc_copy() - changing pool_ns is not
+ * supported.
+ */
+ req->r_t.target_oloc.pool = m.redirect.oloc.pool;
req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
req->r_tid = 0;
__submit_request(req, false);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 7e480bf75..d2436880b 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1510,6 +1510,24 @@ bad:
return ERR_PTR(err);
}
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+ const struct ceph_object_locator *src)
+{
+ WARN_ON(!ceph_oloc_empty(dest));
+ WARN_ON(dest->pool_ns); /* empty() only covers ->pool */
+
+ dest->pool = src->pool;
+ if (src->pool_ns)
+ dest->pool_ns = ceph_get_string(src->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_copy);
+
+void ceph_oloc_destroy(struct ceph_object_locator *oloc)
+{
+ ceph_put_string(oloc->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_destroy);
+
void ceph_oid_copy(struct ceph_object_id *dest,
const struct ceph_object_id *src)
{
@@ -1770,9 +1788,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 *ono,
u64 *oxoff, u64 *oxlen)
{
- u32 osize = le32_to_cpu(layout->fl_object_size);
- u32 su = le32_to_cpu(layout->fl_stripe_unit);
- u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ u32 osize = layout->object_size;
+ u32 su = layout->stripe_unit;
+ u32 sc = layout->stripe_count;
u32 bl, stripeno, stripepos, objsetno;
u32 su_per_object;
u64 t, su_offset;
@@ -1844,12 +1862,34 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
if (!pi)
return -ENOENT;
- raw_pgid->pool = oloc->pool;
- raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
- oid->name_len);
-
- dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
- raw_pgid->pool, raw_pgid->seed);
+ if (!oloc->pool_ns) {
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
+ dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+ raw_pgid->pool, raw_pgid->seed);
+ } else {
+ char stack_buf[256];
+ char *buf = stack_buf;
+ int nsl = oloc->pool_ns->len;
+ size_t total = nsl + 1 + oid->name_len;
+
+ if (total > sizeof(stack_buf)) {
+ buf = kmalloc(total, GFP_NOIO);
+ if (!buf)
+ return -ENOMEM;
+ }
+ memcpy(buf, oloc->pool_ns->str, nsl);
+ buf[nsl] = '\037';
+ memcpy(buf + nsl + 1, oid->name, oid->name_len);
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
+ if (buf != stack_buf)
+ kfree(buf);
+ dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
+ oid->name, nsl, oloc->pool_ns->str,
+ raw_pgid->pool, raw_pgid->seed);
+ }
return 0;
}
EXPORT_SYMBOL(ceph_object_locator_to_pg);
diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
new file mode 100644
index 000000000..22fb96efc
--- /dev/null
+++ b/net/ceph/string_table.c
@@ -0,0 +1,105 @@
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/ceph/string_table.h>
+
+static DEFINE_SPINLOCK(string_tree_lock);
+static struct rb_root string_tree = RB_ROOT;
+
+struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
+{
+ struct ceph_string *cs, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&string_tree_lock);
+ p = &string_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist && !kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ exist = NULL;
+ }
+ spin_unlock(&string_tree_lock);
+ if (exist)
+ return exist;
+
+ cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
+ if (!cs)
+ return NULL;
+
+ kref_init(&cs->kref);
+ cs->len = len;
+ memcpy(cs->str, str, len);
+ cs->str[len] = 0;
+
+retry:
+ exist = NULL;
+ parent = NULL;
+ p = &string_tree.rb_node;
+ spin_lock(&string_tree_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ ret = 0;
+ if (!exist) {
+ rb_link_node(&cs->node, parent, p);
+ rb_insert_color(&cs->node, &string_tree);
+ } else if (!kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ ret = -EAGAIN;
+ }
+ spin_unlock(&string_tree_lock);
+ if (ret == -EAGAIN)
+ goto retry;
+
+ if (exist) {
+ kfree(cs);
+ cs = exist;
+ }
+
+ return cs;
+}
+EXPORT_SYMBOL(ceph_find_or_create_string);
+
+void ceph_release_string(struct kref *ref)
+{
+ struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
+
+ spin_lock(&string_tree_lock);
+ if (!RB_EMPTY_NODE(&cs->node)) {
+ rb_erase(&cs->node, &string_tree);
+ RB_CLEAR_NODE(&cs->node);
+ }
+ spin_unlock(&string_tree_lock);
+
+ kfree_rcu(cs, rcu);
+}
+EXPORT_SYMBOL(ceph_release_string);
+
+bool ceph_strings_empty(void)
+{
+ return RB_EMPTY_ROOT(&string_tree);
+}
diff --git a/net/core/dev.c b/net/core/dev.c
index 97fb3da50..ea6312057 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -94,6 +94,7 @@
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
+#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
@@ -139,6 +140,7 @@
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/sctp.h>
+#include <linux/crash_dump.h>
#include "net-sysfs.h"
@@ -196,7 +198,7 @@ static inline void dev_base_seq_inc(struct net *net)
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
- unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+ unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}
@@ -2249,11 +2251,12 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues);
*/
int netif_get_num_default_rss_queues(void)
{
- return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+ return is_kdump_kernel() ?
+ 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
-static inline void __netif_reschedule(struct Qdisc *q)
+static void __netif_reschedule(struct Qdisc *q)
{
struct softnet_data *sd;
unsigned long flags;
@@ -2420,7 +2423,7 @@ EXPORT_SYMBOL(__skb_tx_hash);
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
- static const netdev_features_t null_features = 0;
+ static const netdev_features_t null_features;
struct net_device *dev = skb->dev;
const char *name = "";
@@ -3068,6 +3071,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
+ struct sk_buff *to_free = NULL;
bool contended;
int rc;
@@ -3075,7 +3079,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
- * This permits __QDISC___STATE_RUNNING owner to get the lock more
+ * This permits qdisc->running owner to get the lock more
* often and dequeue packets faster.
*/
contended = qdisc_is_running(q);
@@ -3084,7 +3088,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
spin_lock(root_lock);
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
- kfree_skb(skb);
+ __qdisc_drop(skb, &to_free);
rc = NET_XMIT_DROP;
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
qdisc_run_begin(q)) {
@@ -3107,7 +3111,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
- rc = q->enqueue(skb, q) & NET_XMIT_MASK;
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -3117,6 +3121,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
}
}
spin_unlock(root_lock);
+ if (unlikely(to_free))
+ kfree_skb_list(to_free);
if (unlikely(contended))
spin_unlock(&q->busylock);
return rc;
@@ -3142,8 +3148,6 @@ static void skb_update_prio(struct sk_buff *skb)
DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);
-#define RECURSION_LIMIT 10
-
/**
* dev_loopback_xmit - loop back @skb
* @net: network namespace this loopback is happening in
@@ -3386,8 +3390,8 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
int cpu = smp_processor_id(); /* ok because BHs are off */
if (txq->xmit_lock_owner != cpu) {
-
- if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+ if (unlikely(__this_cpu_read(xmit_recursion) >
+ XMIT_RECURSION_LIMIT))
goto recursion_alert;
skb = validate_xmit_skb(skb, dev);
@@ -3898,22 +3902,14 @@ static void net_tx_action(struct softirq_action *h)
head = head->next_sched;
root_lock = qdisc_lock(q);
- if (spin_trylock(root_lock)) {
- smp_mb__before_atomic();
- clear_bit(__QDISC_STATE_SCHED,
- &q->state);
- qdisc_run(q);
- spin_unlock(root_lock);
- } else {
- if (!test_bit(__QDISC_STATE_DEACTIVATED,
- &q->state)) {
- __netif_reschedule(q);
- } else {
- smp_mb__before_atomic();
- clear_bit(__QDISC_STATE_SCHED,
- &q->state);
- }
- }
+ spin_lock(root_lock);
+ /* We need to make sure head->next_sched is read
+ * before clearing __QDISC_STATE_SCHED
+ */
+ smp_mb__before_atomic();
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
+ qdisc_run(q);
+ spin_unlock(root_lock);
}
}
}
@@ -4993,7 +4989,7 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
rc = napi->poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi);
+ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
if (rc == BUSY_POLL_BUDGET) {
napi_complete_done(napi, rc);
napi_schedule(napi);
@@ -5149,7 +5145,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
- trace_napi_poll(n);
+ trace_napi_poll(n, work, weight);
}
WARN_ON_ONCE(work > weight);
@@ -5466,6 +5462,52 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
EXPORT_SYMBOL(netdev_lower_get_next);
/**
+ * netdev_all_lower_get_next - Get the next device from all lower neighbour list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's all lower neighbour
+ * list, starting from iter position. The caller must hold RTNL lock or
+ * its own locking that guarantees that the neighbour all lower
+ * list will remain unchanged.
+ */
+struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry(*iter, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->all_adj_list.lower)
+ return NULL;
+
+ *iter = lower->list.next;
+
+ return lower->dev;
+}
+EXPORT_SYMBOL(netdev_all_lower_get_next);
+
+/**
+ * netdev_all_lower_get_next_rcu - Get the next device from all
+ * lower neighbour list, RCU variant
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's all lower neighbour
+ * list, starting from iter position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_first_or_null_rcu(&dev->all_adj_list.lower,
+ struct netdev_adjacent, list);
+
+ return lower ? lower->dev : NULL;
+}
+EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
+
+/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
* lower neighbour list, RCU
* variant
@@ -5935,7 +5977,7 @@ static void netdev_adjacent_add_links(struct net_device *dev)
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.lower);
@@ -5944,7 +5986,7 @@ static void netdev_adjacent_add_links(struct net_device *dev)
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.upper);
@@ -5960,7 +6002,7 @@ static void netdev_adjacent_del_links(struct net_device *dev)
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, dev->name,
&iter->dev->adj_list.lower);
@@ -5969,7 +6011,7 @@ static void netdev_adjacent_del_links(struct net_device *dev)
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, dev->name,
&iter->dev->adj_list.upper);
@@ -5985,7 +6027,7 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.lower);
@@ -5994,7 +6036,7 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.upper);
@@ -6019,8 +6061,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
EXPORT_SYMBOL(netdev_lower_dev_get_private);
-int dev_get_nest_level(struct net_device *dev,
- bool (*type_check)(const struct net_device *dev))
+int dev_get_nest_level(struct net_device *dev)
{
struct net_device *lower = NULL;
struct list_head *iter;
@@ -6030,15 +6071,12 @@ int dev_get_nest_level(struct net_device *dev,
ASSERT_RTNL();
netdev_for_each_lower_dev(dev, lower, iter) {
- nest = dev_get_nest_level(lower, type_check);
+ nest = dev_get_nest_level(lower);
if (max_nest < nest)
max_nest = nest;
}
- if (type_check(dev))
- max_nest++;
-
- return max_nest;
+ return max_nest + 1;
}
EXPORT_SYMBOL(dev_get_nest_level);
@@ -6062,6 +6100,50 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
}
EXPORT_SYMBOL(netdev_lower_state_changed);
+int netdev_default_l2upper_neigh_construct(struct net_device *dev,
+ struct neighbour *n)
+{
+ struct net_device *lower_dev, *stop_dev;
+ struct list_head *iter;
+ int err;
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ if (!lower_dev->netdev_ops->ndo_neigh_construct)
+ continue;
+ err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
+ if (err) {
+ stop_dev = lower_dev;
+ goto rollback;
+ }
+ }
+ return 0;
+
+rollback:
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ if (lower_dev == stop_dev)
+ break;
+ if (!lower_dev->netdev_ops->ndo_neigh_destroy)
+ continue;
+ lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
+
+void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
+ struct neighbour *n)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ if (!lower_dev->netdev_ops->ndo_neigh_destroy)
+ continue;
+ lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
+ }
+}
+EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
+
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -6546,6 +6628,38 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
EXPORT_SYMBOL(dev_change_proto_down);
/**
+ * dev_change_xdp_fd - set or clear a bpf program for a device rx path
+ * @dev: device
+ * @fd: new program fd or negative value to clear
+ *
+ * Set or clear a bpf program for a device
+ */
+int dev_change_xdp_fd(struct net_device *dev, int fd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct bpf_prog *prog = NULL;
+ struct netdev_xdp xdp = {};
+ int err;
+
+ if (!ops->ndo_xdp)
+ return -EOPNOTSUPP;
+ if (fd >= 0) {
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+
+ xdp.command = XDP_SETUP_PROG;
+ xdp.prog = prog;
+ err = ops->ndo_xdp(dev, &xdp);
+ if (err < 0 && prog)
+ bpf_prog_put(prog);
+
+ return err;
+}
+EXPORT_SYMBOL(dev_change_xdp_fd);
+
+/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
*
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 933e8d4d3..1b5063088 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -26,6 +26,10 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/devlink.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/devlink.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
static LIST_HEAD(devlink_list);
@@ -1394,6 +1398,78 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
return -EOPNOTSUPP;
}
+static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags, u16 mode)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ struct sk_buff *msg;
+ u16 mode;
+ int err;
+
+ if (!ops || !ops->eswitch_mode_get)
+ return -EOPNOTSUPP;
+
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
+ info->snd_portid, info->snd_seq, 0, mode);
+
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ u16 mode;
+
+ if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE])
+ return -EINVAL;
+
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+
+ if (ops && ops->eswitch_mode_set)
+ return ops->eswitch_mode_set(devlink, mode);
+ return -EOPNOTSUPP;
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -1407,6 +1483,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -1525,6 +1602,20 @@ static const struct genl_ops devlink_nl_ops[] = {
DEVLINK_NL_FLAG_NEED_SB |
DEVLINK_NL_FLAG_LOCK_PORTS,
},
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_MODE_GET,
+ .doit = devlink_nl_cmd_eswitch_mode_get_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_MODE_SET,
+ .doit = devlink_nl_cmd_eswitch_mode_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
};
/**
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 252e155c8..d6b3b5795 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -187,7 +187,8 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *locatio
trace_drop_common(skb, location);
}
-static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
+static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi,
+ int work, int budget)
{
struct dm_hw_stat_delta *new_stat;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f4034817d..977489820 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -89,6 +89,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
[NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
+ [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 840acebbb..be4629c34 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -173,7 +173,8 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
EXPORT_SYMBOL_GPL(fib_rules_unregister);
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
- struct flowi *fl, int flags)
+ struct flowi *fl, int flags,
+ struct fib_lookup_arg *arg)
{
int ret = 0;
@@ -189,6 +190,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
goto out;
+ if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -204,7 +208,7 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
list_for_each_entry_rcu(rule, &ops->rules_list, list) {
jumped:
- if (!fib_rule_match(rule, ops, fl, flags))
+ if (!fib_rule_match(rule, ops, fl, flags, arg))
continue;
if (rule->action == FR_ACT_GOTO) {
@@ -265,7 +269,50 @@ errout:
return err;
}
-static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
+static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
+ struct nlattr **tb, struct fib_rule *rule)
+{
+ struct fib_rule *r;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->action != rule->action)
+ continue;
+
+ if (r->table != rule->table)
+ continue;
+
+ if (r->pref != rule->pref)
+ continue;
+
+ if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ continue;
+
+ if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ continue;
+
+ if (r->mark != rule->mark)
+ continue;
+
+ if (r->mark_mask != rule->mark_mask)
+ continue;
+
+ if (r->tun_id != rule->tun_id)
+ continue;
+
+ if (r->fr_net != rule->fr_net)
+ continue;
+
+ if (r->l3mdev != rule->l3mdev)
+ continue;
+
+ if (!ops->compare(r, frh, tb))
+ continue;
+ return 1;
+ }
+ return 0;
+}
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
@@ -336,6 +383,14 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
if (tb[FRA_TUN_ID])
rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+ if (tb[FRA_L3MDEV]) {
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
+ if (rule->l3mdev != 1)
+#endif
+ goto errout_free;
+ }
+
rule->action = frh->action;
rule->flags = frh->flags;
rule->table = frh_get_table(frh, tb);
@@ -371,6 +426,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
} else if (rule->action == FR_ACT_GOTO)
goto errout_free;
+ if (rule->l3mdev && rule->table)
+ goto errout_free;
+
+ if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
+ rule_exists(ops, frh, tb, rule)) {
+ err = -EEXIST;
+ goto errout_free;
+ }
+
err = ops->configure(rule, skb, frh, tb);
if (err < 0)
goto errout_free;
@@ -424,8 +488,9 @@ errout:
rules_ops_put(ops);
return err;
}
+EXPORT_SYMBOL_GPL(fib_nl_newrule);
-static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
+int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
@@ -483,6 +548,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
(rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
continue;
+ if (tb[FRA_L3MDEV] &&
+ (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -536,6 +605,7 @@ errout:
rules_ops_put(ops);
return err;
}
+EXPORT_SYMBOL_GPL(fib_nl_delrule);
static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
struct fib_rule *rule)
@@ -607,7 +677,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->target &&
nla_put_u32(skb, FRA_GOTO, rule->target)) ||
(rule->tun_id &&
- nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)))
+ nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
+ (rule->l3mdev &&
+ nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index bca32d63a..cb06aceb5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -150,6 +150,12 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return raw_smp_processor_id();
}
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+ .func = __get_raw_cpu_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
struct bpf_insn *insn_buf)
{
@@ -748,6 +754,17 @@ static bool chk_code_allowed(u16 code_to_probe)
return codes[code_to_probe];
}
+static bool bpf_check_basics_ok(const struct sock_filter *filter,
+ unsigned int flen)
+{
+ if (filter == NULL)
+ return false;
+ if (flen == 0 || flen > BPF_MAXINSNS)
+ return false;
+
+ return true;
+}
+
/**
* bpf_check_classic - verify socket filter code
* @filter: filter to verify
@@ -768,9 +785,6 @@ static int bpf_check_classic(const struct sock_filter *filter,
bool anc_found;
int pc;
- if (flen == 0 || flen > BPF_MAXINSNS)
- return -EINVAL;
-
/* Check the filter code now */
for (pc = 0; pc < flen; pc++) {
const struct sock_filter *ftest = &filter[pc];
@@ -1065,7 +1079,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
struct bpf_prog *fp;
/* Make sure new filter is there and in the right amounts. */
- if (fprog->filter == NULL)
+ if (!bpf_check_basics_ok(fprog->filter, fprog->len))
return -EINVAL;
fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
@@ -1112,7 +1126,7 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
int err;
/* Make sure new filter is there and in the right amounts. */
- if (fprog->filter == NULL)
+ if (!bpf_check_basics_ok(fprog->filter, fprog->len))
return -EINVAL;
fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
@@ -1207,7 +1221,6 @@ static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
unsigned int fsize = bpf_classic_proglen(fprog);
- unsigned int bpf_fsize = bpf_prog_size(fprog->len);
struct bpf_prog *prog;
int err;
@@ -1215,10 +1228,10 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
return ERR_PTR(-EPERM);
/* Make sure new filter is there and in the right amounts. */
- if (fprog->filter == NULL)
+ if (!bpf_check_basics_ok(fprog->filter, fprog->len))
return ERR_PTR(-EINVAL);
- prog = bpf_prog_alloc(bpf_fsize, 0);
+ prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
if (!prog)
return ERR_PTR(-ENOMEM);
@@ -1288,21 +1301,10 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
- struct bpf_prog *prog;
-
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return ERR_PTR(-EPERM);
- prog = bpf_prog_get(ufd);
- if (IS_ERR(prog))
- return prog;
-
- if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
- bpf_prog_put(prog);
- return ERR_PTR(-EINVAL);
- }
-
- return prog;
+ return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
}
int sk_attach_bpf(u32 ufd, struct sock *sk)
@@ -1359,6 +1361,18 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
return err;
}
+static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
+{
+ if (skb_at_tc_ingress(skb))
+ skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
+}
+
+static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
+{
+ if (skb_at_tc_ingress(skb))
+ skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
+}
+
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
@@ -1376,12 +1390,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
ptr = skb->data + offset;
if (flags & BPF_F_RECOMPUTE_CSUM)
- skb_postpull_rcsum(skb, ptr, len);
+ __skb_postpull_rcsum(skb, ptr, len, offset);
memcpy(ptr, from, len);
if (flags & BPF_F_RECOMPUTE_CSUM)
- skb_postpush_rcsum(skb, ptr, len);
+ __skb_postpush_rcsum(skb, ptr, len, offset);
if (flags & BPF_F_INVALIDATE_HASH)
skb_clear_hash(skb);
@@ -1569,9 +1583,33 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.arg5_type = ARG_ANYTHING,
};
+static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return dev_forward_skb(dev, skb);
+}
+
+static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ int ret;
+
+ if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ kfree_skb(skb);
+ return -ENETDOWN;
+ }
+
+ skb->dev = dev;
+
+ __this_cpu_inc(xmit_recursion);
+ ret = dev_queue_xmit(skb);
+ __this_cpu_dec(xmit_recursion);
+
+ return ret;
+}
+
static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
struct net_device *dev;
if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -1581,19 +1619,14 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
if (unlikely(!dev))
return -EINVAL;
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (unlikely(!skb2))
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
return -ENOMEM;
- if (flags & BPF_F_INGRESS) {
- if (skb_at_tc_ingress(skb2))
- skb_postpush_rcsum(skb2, skb_mac_header(skb2),
- skb2->mac_len);
- return dev_forward_skb(dev, skb2);
- }
+ bpf_push_mac_rcsum(skb);
- skb2->dev = dev;
- return dev_queue_xmit(skb2);
+ return flags & BPF_F_INGRESS ?
+ __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
}
static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -1637,15 +1670,10 @@ int skb_do_redirect(struct sk_buff *skb)
return -EINVAL;
}
- if (ri->flags & BPF_F_INGRESS) {
- if (skb_at_tc_ingress(skb))
- skb_postpush_rcsum(skb, skb_mac_header(skb),
- skb->mac_len);
- return dev_forward_skb(dev, skb);
- }
+ bpf_push_mac_rcsum(skb);
- skb->dev = dev;
- return dev_queue_xmit(skb);
+ return ri->flags & BPF_F_INGRESS ?
+ __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
}
static const struct bpf_func_proto bpf_redirect_proto = {
@@ -1680,6 +1708,23 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* If skb_clear_hash() was called due to mangling, we can
+ * trigger SW recalculation here. Later access to hash
+ * can then use the inline skb->hash via context directly
+ * instead of calling this helper again.
+ */
+ return skb_get_hash((struct sk_buff *) (unsigned long) r1);
+}
+
+static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
+ .func = bpf_get_hash_recalc,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
@@ -1690,7 +1735,10 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
vlan_proto != htons(ETH_P_8021AD)))
vlan_proto = htons(ETH_P_8021Q);
+ bpf_push_mac_rcsum(skb);
ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
+ bpf_pull_mac_rcsum(skb);
+
bpf_compute_data_end(skb);
return ret;
}
@@ -1710,7 +1758,10 @@ static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
struct sk_buff *skb = (struct sk_buff *) (long) r1;
int ret;
+ bpf_push_mac_rcsum(skb);
ret = skb_vlan_pop(skb);
+ bpf_pull_mac_rcsum(skb);
+
bpf_compute_data_end(skb);
return ret;
}
@@ -1723,6 +1774,224 @@ const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
+static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
+{
+ /* Caller already did skb_cow() with len as headroom,
+ * so no need to do it here.
+ */
+ skb_push(skb, len);
+ memmove(skb->data, skb->data + len, off);
+ memset(skb->data + off, 0, len);
+
+ /* No skb_postpush_rcsum(skb, skb->data + off, len)
+ * needed here as it does not change the skb->csum
+ * result for checksum complete when summing over
+ * zeroed blocks.
+ */
+ return 0;
+}
+
+static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+ /* skb_ensure_writable() is not needed here, as we're
+ * already working on an uncloned skb.
+ */
+ if (unlikely(!pskb_may_pull(skb, off + len)))
+ return -ENOMEM;
+
+ skb_postpull_rcsum(skb, skb->data + off, len);
+ memmove(skb->data + len, skb->data, off);
+ __skb_pull(skb, len);
+
+ return 0;
+}
+
+static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ int ret;
+
+ /* There's no need for __skb_push()/__skb_pull() pair to
+ * get to the start of the mac header as we're guaranteed
+ * to always start from here under eBPF.
+ */
+ ret = bpf_skb_generic_push(skb, off, len);
+ if (likely(!ret)) {
+ skb->mac_header -= len;
+ skb->network_header -= len;
+ if (trans_same)
+ skb->transport_header = skb->network_header;
+ }
+
+ return ret;
+}
+
+static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ int ret;
+
+ /* Same here, __skb_push()/__skb_pull() pair not needed. */
+ ret = bpf_skb_generic_pop(skb, off, len);
+ if (likely(!ret)) {
+ skb->mac_header += len;
+ skb->network_header += len;
+ if (trans_same)
+ skb->transport_header = skb->network_header;
+ }
+
+ return ret;
+}
+
+static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
+{
+ const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ u32 off = skb->network_header - skb->mac_header;
+ int ret;
+
+ ret = skb_cow(skb, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_push(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
+ * be changed into SKB_GSO_TCPV6.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+ skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+ }
+
+ /* Due to IPv6 header, MSS needs to be downgraded. */
+ skb_shinfo(skb)->gso_size -= len_diff;
+ /* Header must be checked, and gso_segs recomputed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
+{
+ const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ u32 off = skb->network_header - skb->mac_header;
+ int ret;
+
+ ret = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
+ * be changed into SKB_GSO_TCPV4.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
+ skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ }
+
+ /* Due to IPv4 header, MSS can be upgraded. */
+ skb_shinfo(skb)->gso_size += len_diff;
+ /* Header must be checked, and gso_segs recomputed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
+{
+ __be16 from_proto = skb->protocol;
+
+ if (from_proto == htons(ETH_P_IP) &&
+ to_proto == htons(ETH_P_IPV6))
+ return bpf_skb_proto_4_to_6(skb);
+
+ if (from_proto == htons(ETH_P_IPV6) &&
+ to_proto == htons(ETH_P_IP))
+ return bpf_skb_proto_6_to_4(skb);
+
+ return -ENOTSUPP;
+}
+
+static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ __be16 proto = (__force __be16) r2;
+ int ret;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ /* General idea is that this helper does the basic groundwork
+ * needed for changing the protocol, and eBPF program fills the
+ * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
+ * and other helpers, rather than passing a raw buffer here.
+ *
+ * The rationale is to keep this minimal and without a need to
+ * deal with raw packet data. F.e. even if we would pass buffers
+ * here, the program still needs to call the bpf_lX_csum_replace()
+ * helpers anyway. Plus, this way we keep also separation of
+ * concerns, since f.e. bpf_skb_store_bytes() should only take
+ * care of stores.
+ *
+ * Currently, additional options and extension header space are
+ * not supported, but flags register is reserved so we can adapt
+ * that. For offloads, we mark packet as dodgy, so that headers
+ * need to be verified first.
+ */
+ ret = bpf_skb_proto_xlat(skb, proto);
+ bpf_compute_data_end(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_proto_proto = {
+ .func = bpf_skb_change_proto,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u32 pkt_type = r2;
+
+ /* We only allow a restricted subset to be changed for now. */
+ if (unlikely(skb->pkt_type > PACKET_OTHERHOST ||
+ pkt_type > PACKET_OTHERHOST))
+ return -EINVAL;
+
+ skb->pkt_type = pkt_type;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_type_proto = {
+ .func = bpf_skb_change_type,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
bool bpf_helper_changes_skb_data(void *func)
{
if (func == bpf_skb_vlan_push)
@@ -1731,6 +2000,8 @@ bool bpf_helper_changes_skb_data(void *func)
return true;
if (func == bpf_skb_store_bytes)
return true;
+ if (func == bpf_skb_change_proto)
+ return true;
if (func == bpf_l3_csum_replace)
return true;
if (func == bpf_l4_csum_replace)
@@ -1739,6 +2010,47 @@ bool bpf_helper_changes_skb_data(void *func)
return false;
}
+static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
+ unsigned long off, unsigned long len)
+{
+ void *ptr = skb_header_pointer(skb, off, len, dst_buff);
+
+ if (unlikely(!ptr))
+ return len;
+ if (ptr != dst_buff)
+ memcpy(dst_buff, ptr, len);
+
+ return 0;
+}
+
+static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
+ u64 meta_size)
+{
+ struct sk_buff *skb = (struct sk_buff *)(long) r1;
+ struct bpf_map *map = (struct bpf_map *)(long) r2;
+ u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+ void *meta = (void *)(long) r4;
+
+ if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (unlikely(skb_size > skb->len))
+ return -EFAULT;
+
+ return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
+ bpf_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_skb_event_output_proto = {
+ .func = bpf_skb_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_STACK,
+ .arg5_type = ARG_CONST_STACK_SIZE,
+};
+
static unsigned short bpf_tunnel_key_af(u64 flags)
{
return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
@@ -1970,6 +2282,40 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
}
}
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(long)r1;
+ struct bpf_map *map = (struct bpf_map *)(long)r2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+ struct sock *sk;
+ u32 i = (u32)r3;
+
+ sk = skb->sk;
+ if (!sk || !sk_fullsock(sk))
+ return -ENOENT;
+
+ if (unlikely(i >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[i]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
+}
+
+static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
+ .func = bpf_skb_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+#endif
+
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
@@ -1983,7 +2329,7 @@ sk_filter_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_smp_processor_id_proto;
+ return &bpf_get_raw_smp_processor_id_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_ktime_get_ns:
@@ -2018,6 +2364,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_vlan_push_proto;
case BPF_FUNC_skb_vlan_pop:
return &bpf_skb_vlan_pop_proto;
+ case BPF_FUNC_skb_change_proto:
+ return &bpf_skb_change_proto_proto;
+ case BPF_FUNC_skb_change_type:
+ return &bpf_skb_change_type_proto;
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
@@ -2030,13 +2380,27 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_redirect_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
case BPF_FUNC_perf_event_output:
- return bpf_get_event_output_proto();
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+#endif
default:
return sk_filter_func_proto(func_id);
}
}
+static const struct bpf_func_proto *
+xdp_func_proto(enum bpf_func_id func_id)
+{
+ return sk_filter_func_proto(func_id);
+}
+
static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2104,6 +2468,44 @@ static bool tc_cls_act_is_valid_access(int off, int size,
return __is_valid_access(off, size, type);
}
+static bool __is_valid_xdp_access(int off, int size,
+ enum bpf_access_type type)
+{
+ if (off < 0 || off >= sizeof(struct xdp_md))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (size != 4)
+ return false;
+
+ return true;
+}
+
+static bool xdp_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ if (type == BPF_WRITE)
+ return false;
+
+ switch (off) {
+ case offsetof(struct xdp_md, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct xdp_md, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_xdp_access(off, size, type);
+}
+
+void bpf_warn_invalid_xdp_action(u32 act)
+{
+ WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
+}
+EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+
static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,
@@ -2255,6 +2657,29 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
return insn - insn_buf;
}
+static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct xdp_md, data):
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)),
+ dst_reg, src_reg,
+ offsetof(struct xdp_buff, data));
+ break;
+ case offsetof(struct xdp_md, data_end):
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)),
+ dst_reg, src_reg,
+ offsetof(struct xdp_buff, data_end));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
@@ -2267,6 +2692,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
.convert_ctx_access = bpf_net_convert_ctx_access,
};
+static const struct bpf_verifier_ops xdp_ops = {
+ .get_func_proto = xdp_func_proto,
+ .is_valid_access = xdp_is_valid_access,
+ .convert_ctx_access = xdp_convert_ctx_access,
+};
+
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2282,11 +2713,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
.type = BPF_PROG_TYPE_SCHED_ACT,
};
+static struct bpf_prog_type_list xdp_type __read_mostly = {
+ .ops = &xdp_ops,
+ .type = BPF_PROG_TYPE_XDP,
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
+ bpf_register_prog_type(&xdp_type);
return 0;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 61ad43f61..52742a028 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -680,11 +680,13 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
void __skb_get_hash(struct sk_buff *skb)
{
struct flow_keys keys;
+ u32 hash;
__flow_hash_secret_init();
- __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd),
- flow_keys_have_l4(&keys));
+ hash = ___skb_get_hash(skb, &keys, hashrnd);
+
+ __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
EXPORT_SYMBOL(__skb_get_hash);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 4573d8109..cad8e791f 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -84,6 +84,7 @@ struct gen_estimator
struct gnet_stats_basic_packed *bstats;
struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
+ seqcount_t *running;
int ewma_log;
u32 last_packets;
unsigned long avpps;
@@ -121,26 +122,28 @@ static void est_timer(unsigned long arg)
unsigned long rate;
u64 brate;
- spin_lock(e->stats_lock);
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
read_lock(&est_lock);
if (e->bstats == NULL)
goto skip;
- __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+ __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
brate = (b.bytes - e->last_bytes)<<(7 - idx);
e->last_bytes = b.bytes;
e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
- e->rate_est->bps = (e->avbps+0xF)>>5;
+ WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
rate = b.packets - e->last_packets;
rate <<= (7 - idx);
e->last_packets = b.packets;
e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- e->rate_est->pps = (e->avpps + 0xF) >> 5;
+ WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
skip:
read_unlock(&est_lock);
- spin_unlock(e->stats_lock);
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
}
if (!list_empty(&elist[idx].list))
@@ -194,6 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
+ * @running: qdisc running seqcount
* @opt: rate estimator configuration TLV
*
* Creates a new rate estimator with &bstats as source and &rate_est
@@ -209,6 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock,
+ seqcount_t *running,
struct nlattr *opt)
{
struct gen_estimator *est;
@@ -226,12 +231,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
if (est == NULL)
return -ENOBUFS;
- __gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+ __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
idx = parm->interval + 2;
est->bstats = bstats;
est->rate_est = rate_est;
est->stats_lock = stats_lock;
+ est->running = running;
est->ewma_log = parm->ewma_log;
est->last_bytes = b.bytes;
est->avbps = rate_est->bps<<5;
@@ -291,6 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
+ * @running: qdisc running seqcount (might be NULL)
* @opt: rate estimator configuration TLV
*
* Replaces the configuration of a rate estimator by calling
@@ -301,10 +308,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
- spinlock_t *stats_lock, struct nlattr *opt)
+ spinlock_t *stats_lock,
+ seqcount_t *running, struct nlattr *opt)
{
gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index be873e4e3..508e05130 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -32,10 +32,11 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
return 0;
nla_put_failure:
+ if (d->lock)
+ spin_unlock_bh(d->lock);
kfree(d->xstats);
d->xstats = NULL;
d->xstats_len = 0;
- spin_unlock_bh(d->lock);
return -1;
}
@@ -66,15 +67,16 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
{
memset(d, 0, sizeof(*d));
- spin_lock_bh(lock);
- d->lock = lock;
if (type)
d->tail = (struct nlattr *)skb_tail_pointer(skb);
d->skb = skb;
d->compat_tc_stats = tc_stats_type;
d->compat_xstats = xstats_type;
d->padattr = padattr;
-
+ if (lock) {
+ d->lock = lock;
+ spin_lock_bh(lock);
+ }
if (d->tail)
return gnet_stats_copy(d, type, NULL, 0, padattr);
@@ -128,21 +130,29 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
}
void
-__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+__gnet_stats_copy_basic(const seqcount_t *running,
+ struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
+ unsigned int seq;
+
if (cpu) {
__gnet_stats_copy_basic_cpu(bstats, cpu);
- } else {
+ return;
+ }
+ do {
+ if (running)
+ seq = read_seqcount_begin(running);
bstats->bytes = b->bytes;
bstats->packets = b->packets;
- }
+ } while (running && read_seqcount_retry(running, seq));
}
EXPORT_SYMBOL(__gnet_stats_copy_basic);
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
+ * @running: seqcount_t pointer
* @d: dumping handle
* @cpu: copy statistic per cpu
* @b: basic statistics
@@ -154,13 +164,14 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(struct gnet_dump *d,
+gnet_stats_copy_basic(const seqcount_t *running,
+ struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
struct gnet_stats_basic_packed bstats = {0};
- __gnet_stats_copy_basic(&bstats, cpu, b);
+ __gnet_stats_copy_basic(running, &bstats, cpu, b);
if (d->compat_tc_stats) {
d->tc_stats.bytes = bstats.bytes;
@@ -330,8 +341,9 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
return 0;
err_out:
+ if (d->lock)
+ spin_unlock_bh(d->lock);
d->xstats_len = 0;
- spin_unlock_bh(d->lock);
return -1;
}
EXPORT_SYMBOL(gnet_stats_copy_app);
@@ -365,10 +377,11 @@ gnet_stats_finish_copy(struct gnet_dump *d)
return -1;
}
+ if (d->lock)
+ spin_unlock_bh(d->lock);
kfree(d->xstats);
d->xstats = NULL;
d->xstats_len = 0;
- spin_unlock_bh(d->lock);
return 0;
}
EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 510cd62fc..cf26e04c4 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -473,7 +473,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
}
if (dev->netdev_ops->ndo_neigh_construct) {
- error = dev->netdev_ops->ndo_neigh_construct(n);
+ error = dev->netdev_ops->ndo_neigh_construct(dev, n);
if (error < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
@@ -701,7 +701,7 @@ void neigh_destroy(struct neighbour *neigh)
neigh->arp_queue_len_bytes = 0;
if (dev->netdev_ops->ndo_neigh_destroy)
- dev->netdev_ops->ndo_neigh_destroy(neigh);
+ dev->netdev_ops->ndo_neigh_destroy(dev, neigh);
dev_put(dev);
neigh_parms_put(neigh->parms);
@@ -1060,8 +1060,6 @@ static void neigh_update_hhs(struct neighbour *neigh)
NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
lladdr instead of overriding it
if it is different.
- It also allows to retain current state
- if lladdr is unchanged.
NEIGH_UPDATE_F_ADMIN means that the change is administrative.
NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
@@ -1150,10 +1148,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
} else
goto out;
} else {
- if (lladdr == neigh->ha && new == NUD_STALE &&
- ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
- (old & NUD_CONNECTED))
- )
+ if (lladdr == neigh->ha && new == NUD_STALE)
new = old;
}
}
@@ -2047,6 +2042,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
case NDTPA_DELAY_PROBE_TIME:
NEIGH_VAR_SET(p, DELAY_PROBE_TIME,
nla_get_msecs(tbp[i]));
+ call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
break;
case NDTPA_RETRANS_TIME:
NEIGH_VAR_SET(p, RETRANS_TIME,
@@ -2930,6 +2926,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
return;
set_bit(index, p->data_state);
+ call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
if (!dev) /* NULL dev means this is default value */
neigh_copy_dflt_parms(net, p, index);
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7a0b61655..6e4f34721 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
- dev->tx_queue_len = new_len;
+ int res, orig_len = dev->tx_queue_len;
+
+ if (new_len != orig_len) {
+ dev->tx_queue_len = new_len;
+ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+ res = notifier_to_errno(res);
+ if (res) {
+ netdev_err(dev,
+ "refused to change device tx_queue_len\n");
+ dev->tx_queue_len = orig_len;
+ return -EFAULT;
+ }
+ }
+
return 0;
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 94acfc89a..53599bd0c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -163,7 +163,7 @@ static void poll_one_napi(struct napi_struct *napi)
*/
work = napi->poll(napi, 0);
WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll);
- trace_napi_poll(napi);
+ trace_napi_poll(napi, work, 0);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8b02df0d3..bbd118b19 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -213,6 +213,7 @@
/* Xmit modes */
#define M_START_XMIT 0 /* Default normal TX */
#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
/* If lock -- protects updating of if_list */
#define if_lock(t) spin_lock(&(t->if_lock));
@@ -626,6 +627,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->xmit_mode == M_NETIF_RECEIVE)
seq_puts(seq, " xmit_mode: netif_receive\n");
+ else if (pkt_dev->xmit_mode == M_QUEUE_XMIT)
+ seq_puts(seq, " xmit_mode: xmit_queue\n");
seq_puts(seq, " Flags: ");
@@ -1142,8 +1145,10 @@ static ssize_t pktgen_if_write(struct file *file,
return len;
i += len;
- if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) &&
- (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ if ((value > 1) &&
+ ((pkt_dev->xmit_mode == M_QUEUE_XMIT) ||
+ ((pkt_dev->xmit_mode == M_START_XMIT) &&
+ (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
return -ENOTSUPP;
pkt_dev->burst = value < 1 ? 1 : value;
sprintf(pg_result, "OK: burst=%d", pkt_dev->burst);
@@ -1198,6 +1203,9 @@ static ssize_t pktgen_if_write(struct file *file,
* at module loading time
*/
pkt_dev->clone_skb = 0;
+ } else if (strcmp(f, "queue_xmit") == 0) {
+ pkt_dev->xmit_mode = M_QUEUE_XMIT;
+ pkt_dev->last_ok = 1;
} else {
sprintf(pg_result,
"xmit_mode -:%s:- unknown\nAvailable modes: %s",
@@ -3434,6 +3442,36 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
#endif
} while (--burst > 0);
goto out; /* Skips xmit_mode M_START_XMIT */
+ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
+ local_bh_disable();
+ atomic_inc(&pkt_dev->skb->users);
+
+ ret = dev_queue_xmit(pkt_dev->skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ break;
+ case NET_XMIT_DROP:
+ case NET_XMIT_CN:
+ /* These are all valid return codes for a qdisc but
+ * indicate packets are being dropped or will likely
+ * be dropped soon.
+ */
+ case NETDEV_TX_BUSY:
+ /* qdisc may call dev_hard_start_xmit directly in cases
+ * where no queues exist e.g. loopback device, virtual
+ * devices, etc. In this case we need to handle
+ * NETDEV_TX_ codes.
+ */
+ default:
+ pkt_dev->errors++;
+ net_info_ratelimited("%s xmit error: %d\n",
+ pkt_dev->odevname, ret);
+ break;
+ }
+ goto out;
}
txq = skb_get_tx_queue(odev, pkt_dev->skb);
@@ -3463,7 +3501,6 @@ xmit_more:
break;
case NET_XMIT_DROP:
case NET_XMIT_CN:
- case NET_XMIT_POLICED:
/* skb has been consumed */
pkt_dev->errors++;
break;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d69c4644f..189cc78c7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -71,9 +71,31 @@ void rtnl_lock(void)
}
EXPORT_SYMBOL(rtnl_lock);
+static struct sk_buff *defer_kfree_skb_list;
+void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
+{
+ if (head && tail) {
+ tail->next = defer_kfree_skb_list;
+ defer_kfree_skb_list = head;
+ }
+}
+EXPORT_SYMBOL(rtnl_kfree_skbs);
+
void __rtnl_unlock(void)
{
+ struct sk_buff *head = defer_kfree_skb_list;
+
+ defer_kfree_skb_list = NULL;
+
mutex_unlock(&rtnl_mutex);
+
+ while (head) {
+ struct sk_buff *next = head->next;
+
+ kfree_skb(head);
+ cond_resched();
+ head = next;
+ }
}
void rtnl_unlock(void)
@@ -869,6 +891,16 @@ static size_t rtnl_port_size(const struct net_device *dev,
return port_self_size;
}
+static size_t rtnl_xdp_size(const struct net_device *dev)
+{
+ size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */
+
+ if (!dev->netdev_ops->ndo_xdp)
+ return 0;
+ else
+ return xdp_size;
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
@@ -905,6 +937,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+ + rtnl_xdp_size(dev) /* IFLA_XDP */
+ nla_total_size(1); /* IFLA_PROTO_DOWN */
}
@@ -1189,6 +1222,33 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct netdev_xdp xdp_op = {};
+ struct nlattr *xdp;
+ int err;
+
+ if (!dev->netdev_ops->ndo_xdp)
+ return 0;
+ xdp = nla_nest_start(skb, IFLA_XDP);
+ if (!xdp)
+ return -EMSGSIZE;
+ xdp_op.command = XDP_QUERY_PROG;
+ err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
+ if (err)
+ goto err_cancel;
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached);
+ if (err)
+ goto err_cancel;
+
+ nla_nest_end(skb, xdp);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, xdp);
+ return err;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, u32 ext_filter_mask)
@@ -1285,6 +1345,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
if (rtnl_port_fill(skb, dev, ext_filter_mask))
goto nla_put_failure;
+ if (rtnl_xdp_fill(skb, dev))
+ goto nla_put_failure;
+
if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
if (rtnl_link_fill(skb, dev) < 0)
goto nla_put_failure;
@@ -1370,6 +1433,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
[IFLA_PROTO_DOWN] = { .type = NLA_U8 },
+ [IFLA_XDP] = { .type = NLA_NESTED },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1407,6 +1471,11 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
};
+static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
+ [IFLA_XDP_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
+};
+
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
{
const struct rtnl_link_ops *ops = NULL;
@@ -1905,11 +1974,19 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_TXQLEN]) {
unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
-
- if (dev->tx_queue_len ^ value)
+ unsigned long orig_len = dev->tx_queue_len;
+
+ if (dev->tx_queue_len ^ value) {
+ dev->tx_queue_len = value;
+ err = call_netdevice_notifiers(
+ NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+ err = notifier_to_errno(err);
+ if (err) {
+ dev->tx_queue_len = orig_len;
+ goto errout;
+ }
status |= DO_SETLINK_NOTIFY;
-
- dev->tx_queue_len = value;
+ }
}
if (tb[IFLA_OPERSTATE])
@@ -2024,6 +2101,27 @@ static int do_setlink(const struct sk_buff *skb,
status |= DO_SETLINK_NOTIFY;
}
+ if (tb[IFLA_XDP]) {
+ struct nlattr *xdp[IFLA_XDP_MAX + 1];
+
+ err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
+ ifla_xdp_policy);
+ if (err < 0)
+ goto errout;
+
+ if (xdp[IFLA_XDP_ATTACHED]) {
+ err = -EINVAL;
+ goto errout;
+ }
+ if (xdp[IFLA_XDP_FD]) {
+ err = dev_change_xdp_fd(dev,
+ nla_get_s32(xdp[IFLA_XDP_FD]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+
errout:
if (status & DO_SETLINK_MODIFIED) {
if (status & DO_SETLINK_NOTIFY)
@@ -3497,7 +3595,32 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (!attr)
goto nla_put_failure;
- err = ops->fill_linkxstats(skb, dev, prividx);
+ err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
+ nla_nest_end(skb, attr);
+ if (err)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE,
+ *idxattr)) {
+ const struct rtnl_link_ops *ops = NULL;
+ const struct net_device *master;
+
+ master = netdev_master_upper_dev_get(dev);
+ if (master)
+ ops = master->rtnl_link_ops;
+ if (ops && ops->fill_linkxstats) {
+ int err;
+
+ *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
+ attr = nla_nest_start(skb,
+ IFLA_STATS_LINK_XSTATS_SLAVE);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
nla_nest_end(skb, attr);
if (err)
goto nla_put_failure;
@@ -3533,14 +3656,35 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) {
const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ int attr = IFLA_STATS_LINK_XSTATS;
if (ops && ops->get_linkxstats_size) {
- size += nla_total_size(ops->get_linkxstats_size(dev));
+ size += nla_total_size(ops->get_linkxstats_size(dev,
+ attr));
/* for IFLA_STATS_LINK_XSTATS */
size += nla_total_size(0);
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) {
+ struct net_device *_dev = (struct net_device *)dev;
+ const struct rtnl_link_ops *ops = NULL;
+ const struct net_device *master;
+
+ /* netdev_master_upper_dev_get can't take const */
+ master = netdev_master_upper_dev_get(_dev);
+ if (master)
+ ops = master->rtnl_link_ops;
+ if (ops && ops->get_linkxstats_size) {
+ int attr = IFLA_STATS_LINK_XSTATS_SLAVE;
+
+ size += nla_total_size(ops->get_linkxstats_size(dev,
+ attr));
+ /* for IFLA_STATS_LINK_XSTATS_SLAVE */
+ size += nla_total_size(0);
+ }
+ }
+
return size;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index eb12d2161..3864b4b68 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -49,6 +49,7 @@
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
+#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
@@ -3098,9 +3099,13 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
int hsize;
int size;
- len = head_skb->len - offset;
- if (len > mss)
- len = mss;
+ if (unlikely(mss == GSO_BY_FRAGS)) {
+ len = list_skb->len;
+ } else {
+ len = head_skb->len - offset;
+ if (len > mss)
+ len = mss;
+ }
hsize = skb_headlen(head_skb) - offset;
if (hsize < 0)
@@ -3420,6 +3425,7 @@ done:
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
}
+EXPORT_SYMBOL_GPL(skb_gro_receive);
void __init skb_init(void)
{
@@ -4360,6 +4366,8 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
thlen += inner_tcp_hdrlen(skb);
} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
thlen = tcp_hdrlen(skb);
+ } else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) {
+ thlen = sizeof(struct sctphdr);
}
/* UFO sets gso_size to the size of the fragmentation
* payload, i.e. the size of the L4 (UDP) header is already
@@ -4369,6 +4377,38 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
+/**
+ * skb_gso_validate_mtu - Return in case such skb fits a given MTU
+ *
+ * @skb: GSO skb
+ * @mtu: MTU to validate against
+ *
+ * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU
+ * once split.
+ */
+bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ const struct sk_buff *iter;
+ unsigned int hlen;
+
+ hlen = skb_gso_network_seglen(skb);
+
+ if (shinfo->gso_size != GSO_BY_FRAGS)
+ return hlen <= mtu;
+
+ /* Undo this so we can re-use header sizes */
+ hlen -= GSO_BY_FRAGS;
+
+ skb_walk_frags(skb, iter) {
+ if (hlen + skb_headlen(iter) > mtu)
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_mtu);
+
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
if (skb_cow(skb, skb_headroom(skb)) < 0) {
diff --git a/net/core/utils.c b/net/core/utils.c
index 3d17ca8b4..cf5622b9c 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -133,7 +133,7 @@ int in4_pton(const char *src, int srclen,
s = src;
d = dbuf;
i = 0;
- while(1) {
+ while (1) {
int c;
c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) {
@@ -283,11 +283,11 @@ cont:
i = 15; d--;
if (dc) {
- while(d >= dc)
+ while (d >= dc)
dst[i--] = *d--;
- while(i >= dc - dbuf)
+ while (i >= dc - dbuf)
dst[i--] = 0;
- while(i >= 0)
+ while (i >= 0)
dst[i--] = *d--;
} else
memcpy(dst, dbuf, sizeof(dbuf));
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 3ff137d94..3828f94b2 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -216,14 +216,17 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
skb = dccp_make_response(sk, dst, req);
if (skb != NULL) {
struct dccp_hdr *dh = dccp_hdr(skb);
+ struct ipv6_txoptions *opt;
dh->dccph_checksum = dccp_v6_csum_finish(skb,
&ireq->ir_v6_loc_addr,
&ireq->ir_v6_rmt_addr);
fl6.daddr = ireq->ir_v6_rmt_addr;
rcu_read_lock();
- err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
- np->tclass);
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
+ err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -236,6 +239,7 @@ done:
static void dccp_v6_reqsk_destructor(struct request_sock *req)
{
dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+ kfree(inet_rsk(req)->ipv6_opt);
kfree_skb(inet_rsk(req)->pktopts);
}
@@ -494,7 +498,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
* Yes, keeping reference count would be much more clever, but we make
* one more one thing there: reattach optmem to newsk.
*/
- opt = rcu_dereference(np->opt);
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
if (opt) {
opt = ipv6_dup_options(newsk, opt);
RCU_INIT_POINTER(newnp->opt, opt);
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index da06ed1df..8af4ded70 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,6 @@
# the core
obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o slave.o
+dsa_core-y += dsa.o slave.o dsa2.o
# tagging formats
dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index eff5dfc2e..7e68bc6bc 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -29,6 +29,33 @@
char dsa_driver_version[] = "0.1";
+static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* Just return the original SKB */
+ return skb;
+}
+
+static const struct dsa_device_ops none_ops = {
+ .xmit = dsa_slave_notag_xmit,
+ .rcv = NULL,
+};
+
+const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
+#ifdef CONFIG_NET_DSA_TAG_DSA
+ [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops,
+#endif
+#ifdef CONFIG_NET_DSA_TAG_EDSA
+ [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops,
+#endif
+#ifdef CONFIG_NET_DSA_TAG_TRAILER
+ [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops,
+#endif
+#ifdef CONFIG_NET_DSA_TAG_BRCM
+ [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops,
+#endif
+ [DSA_TAG_PROTO_NONE] = &none_ops,
+};
/* switch driver registration ***********************************************/
static DEFINE_MUTEX(dsa_switch_drivers_mutex);
@@ -180,41 +207,100 @@ __ATTRIBUTE_GROUPS(dsa_hwmon);
#endif /* CONFIG_NET_DSA_HWMON */
/* basic switch operations **************************************************/
-static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master)
+int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
+ struct device_node *port_dn, int port)
{
- struct dsa_chip_data *cd = ds->cd;
- struct device_node *port_dn;
struct phy_device *phydev;
- int ret, port, mode;
+ int ret, mode;
+
+ if (of_phy_is_fixed_link(port_dn)) {
+ ret = of_phy_register_fixed_link(port_dn);
+ if (ret) {
+ dev_err(dev, "failed to register fixed PHY\n");
+ return ret;
+ }
+ phydev = of_phy_find_device(port_dn);
+
+ mode = of_get_phy_mode(port_dn);
+ if (mode < 0)
+ mode = PHY_INTERFACE_MODE_NA;
+ phydev->interface = mode;
+
+ genphy_config_init(phydev);
+ genphy_read_status(phydev);
+ if (ds->drv->adjust_link)
+ ds->drv->adjust_link(ds, port, phydev);
+ }
+
+ return 0;
+}
+
+static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev)
+{
+ struct device_node *port_dn;
+ int ret, port;
for (port = 0; port < DSA_MAX_PORTS; port++) {
if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
continue;
- port_dn = cd->port_dn[port];
- if (of_phy_is_fixed_link(port_dn)) {
- ret = of_phy_register_fixed_link(port_dn);
- if (ret) {
- netdev_err(master,
- "failed to register fixed PHY\n");
- return ret;
- }
- phydev = of_phy_find_device(port_dn);
+ port_dn = ds->ports[port].dn;
+ ret = dsa_cpu_dsa_setup(ds, dev, port_dn, port);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
- mode = of_get_phy_mode(port_dn);
- if (mode < 0)
- mode = PHY_INTERFACE_MODE_NA;
- phydev->interface = mode;
+const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol)
+{
+ const struct dsa_device_ops *ops;
+
+ if (tag_protocol >= DSA_TAG_LAST)
+ return ERR_PTR(-EINVAL);
+ ops = dsa_device_ops[tag_protocol];
+
+ if (!ops)
+ return ERR_PTR(-ENOPROTOOPT);
+
+ return ops;
+}
+
+int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds)
+{
+ struct net_device *master;
+ struct ethtool_ops *cpu_ops;
+
+ master = ds->dst->master_netdev;
+ if (ds->master_netdev)
+ master = ds->master_netdev;
+
+ cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL);
+ if (!cpu_ops)
+ return -ENOMEM;
+
+ memcpy(&ds->dst->master_ethtool_ops, master->ethtool_ops,
+ sizeof(struct ethtool_ops));
+ ds->dst->master_orig_ethtool_ops = master->ethtool_ops;
+ memcpy(cpu_ops, &ds->dst->master_ethtool_ops,
+ sizeof(struct ethtool_ops));
+ dsa_cpu_port_ethtool_init(cpu_ops);
+ master->ethtool_ops = cpu_ops;
- genphy_config_init(phydev);
- genphy_read_status(phydev);
- if (ds->drv->adjust_link)
- ds->drv->adjust_link(ds, port, phydev);
- }
- }
return 0;
}
+void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds)
+{
+ struct net_device *master;
+
+ master = ds->dst->master_netdev;
+ if (ds->master_netdev)
+ master = ds->master_netdev;
+
+ master->ethtool_ops = ds->dst->master_orig_ethtool_ops;
+}
+
static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
{
struct dsa_switch_driver *drv = ds->drv;
@@ -243,6 +329,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
}
dst->cpu_switch = index;
dst->cpu_port = i;
+ ds->cpu_port_mask |= 1 << i;
} else if (!strcmp(name, "dsa")) {
ds->dsa_port_mask |= 1 << i;
} else {
@@ -267,37 +354,17 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
* switch.
*/
if (dst->cpu_switch == index) {
- switch (drv->tag_protocol) {
-#ifdef CONFIG_NET_DSA_TAG_DSA
- case DSA_TAG_PROTO_DSA:
- dst->rcv = dsa_netdev_ops.rcv;
- break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_EDSA
- case DSA_TAG_PROTO_EDSA:
- dst->rcv = edsa_netdev_ops.rcv;
- break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_TRAILER
- case DSA_TAG_PROTO_TRAILER:
- dst->rcv = trailer_netdev_ops.rcv;
- break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_BRCM
- case DSA_TAG_PROTO_BRCM:
- dst->rcv = brcm_netdev_ops.rcv;
- break;
-#endif
- case DSA_TAG_PROTO_NONE:
- break;
- default:
- ret = -ENOPROTOOPT;
+ dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol);
+ if (IS_ERR(dst->tag_ops)) {
+ ret = PTR_ERR(dst->tag_ops);
goto out;
}
- dst->tag_protocol = drv->tag_protocol;
+ dst->rcv = dst->tag_ops->rcv;
}
+ memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable));
+
/*
* Do basic register setup.
*/
@@ -309,22 +376,25 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
if (ret < 0)
goto out;
- ds->slave_mii_bus = devm_mdiobus_alloc(parent);
- if (ds->slave_mii_bus == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- dsa_slave_mii_bus_init(ds);
-
- ret = mdiobus_register(ds->slave_mii_bus);
- if (ret < 0)
- goto out;
+ if (!ds->slave_mii_bus && drv->phy_read) {
+ ds->slave_mii_bus = devm_mdiobus_alloc(parent);
+ if (!ds->slave_mii_bus) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dsa_slave_mii_bus_init(ds);
+ ret = mdiobus_register(ds->slave_mii_bus);
+ if (ret < 0)
+ goto out;
+ }
/*
* Create network devices for physical switch ports.
*/
for (i = 0; i < DSA_MAX_PORTS; i++) {
+ ds->ports[i].dn = cd->port_dn[i];
+
if (!(ds->enabled_port_mask & (1 << i)))
continue;
@@ -337,13 +407,17 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
}
/* Perform configuration of the CPU and DSA ports */
- ret = dsa_cpu_dsa_setup(ds, dst->master_netdev);
+ ret = dsa_cpu_dsa_setups(ds, parent);
if (ret < 0) {
netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n",
index);
ret = 0;
}
+ ret = dsa_cpu_port_ethtool_setup(ds);
+ if (ret)
+ return ret;
+
#ifdef CONFIG_NET_DSA_HWMON
/* If the switch provides a temperature sensor,
* register with hardware monitoring subsystem.
@@ -420,11 +494,21 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
return ds;
}
-static void dsa_switch_destroy(struct dsa_switch *ds)
+void dsa_cpu_dsa_destroy(struct device_node *port_dn)
{
- struct device_node *port_dn;
struct phy_device *phydev;
- struct dsa_chip_data *cd = ds->cd;
+
+ if (of_phy_is_fixed_link(port_dn)) {
+ phydev = of_phy_find_device(port_dn);
+ if (phydev) {
+ phy_device_free(phydev);
+ fixed_phy_unregister(phydev);
+ }
+ }
+}
+
+static void dsa_switch_destroy(struct dsa_switch *ds)
+{
int port;
#ifdef CONFIG_NET_DSA_HWMON
@@ -437,26 +521,25 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
if (!(ds->enabled_port_mask & (1 << port)))
continue;
- if (!ds->ports[port])
+ if (!ds->ports[port].netdev)
continue;
- dsa_slave_destroy(ds->ports[port]);
+ dsa_slave_destroy(ds->ports[port].netdev);
}
- /* Remove any fixed link PHYs */
+ /* Disable configuration of the CPU and DSA ports */
for (port = 0; port < DSA_MAX_PORTS; port++) {
- port_dn = cd->port_dn[port];
- if (of_phy_is_fixed_link(port_dn)) {
- phydev = of_phy_find_device(port_dn);
- if (phydev) {
- phy_device_free(phydev);
- of_node_put(port_dn);
- fixed_phy_unregister(phydev);
- }
- }
+ if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
+ continue;
+ dsa_cpu_dsa_destroy(ds->ports[port].dn);
+
+ /* Clearing a bit which is not set does no harm */
+ ds->cpu_port_mask |= ~(1 << port);
+ ds->dsa_port_mask |= ~(1 << port);
}
- mdiobus_unregister(ds->slave_mii_bus);
+ if (ds->slave_mii_bus && ds->drv->phy_read)
+ mdiobus_unregister(ds->slave_mii_bus);
}
#ifdef CONFIG_PM_SLEEP
@@ -469,7 +552,7 @@ static int dsa_switch_suspend(struct dsa_switch *ds)
if (!dsa_is_port_initialized(ds, i))
continue;
- ret = dsa_slave_suspend(ds->ports[i]);
+ ret = dsa_slave_suspend(ds->ports[i].netdev);
if (ret)
return ret;
}
@@ -495,7 +578,7 @@ static int dsa_switch_resume(struct dsa_switch *ds)
if (!dsa_is_port_initialized(ds, i))
continue;
- ret = dsa_slave_resume(ds->ports[i]);
+ ret = dsa_slave_resume(ds->ports[i].netdev);
if (ret)
return ret;
}
@@ -587,17 +670,6 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
if (link_sw_addr >= pd->nr_chips)
return -EINVAL;
- /* First time routing table allocation */
- if (!cd->rtable) {
- cd->rtable = kmalloc_array(pd->nr_chips, sizeof(s8),
- GFP_KERNEL);
- if (!cd->rtable)
- return -ENOMEM;
-
- /* default to no valid uplink/downlink */
- memset(cd->rtable, -1, pd->nr_chips * sizeof(s8));
- }
-
cd->rtable[link_sw_addr] = port_index;
return 0;
@@ -639,7 +711,6 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
kfree(pd->chip[i].port_names[port_index]);
port_index++;
}
- kfree(pd->chip[i].rtable);
/* Drop our reference to the MDIO bus device */
if (pd->chip[i].host_dev)
@@ -703,11 +774,17 @@ static int dsa_of_probe(struct device *dev)
chip_index = -1;
for_each_available_child_of_node(np, child) {
+ int i;
+
chip_index++;
cd = &pd->chip[chip_index];
cd->of_node = child;
+ /* Initialize the routing table */
+ for (i = 0; i < DSA_MAX_SWITCHES; ++i)
+ cd->rtable[i] = DSA_RTABLE_NONE;
+
/* When assigning the host device, increment its refcount */
cd->host_dev = get_device(&mdio_bus->dev);
@@ -931,6 +1008,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
dsa_switch_destroy(ds);
}
+ dsa_cpu_port_ethtool_restore(dst->ds[0]);
+
dev_put(dst->master_netdev);
}
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
new file mode 100644
index 000000000..f30bad967
--- /dev/null
+++ b/net/dsa/dsa2.c
@@ -0,0 +1,695 @@
+/*
+ * net/dsa/dsa2.c - Hardware switch handling, binding version 2
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <net/dsa.h>
+#include <linux/of.h>
+#include <linux/of_net.h>
+#include "dsa_priv.h"
+
+static LIST_HEAD(dsa_switch_trees);
+static DEFINE_MUTEX(dsa2_mutex);
+
+static struct dsa_switch_tree *dsa_get_dst(u32 tree)
+{
+ struct dsa_switch_tree *dst;
+
+ list_for_each_entry(dst, &dsa_switch_trees, list)
+ if (dst->tree == tree)
+ return dst;
+ return NULL;
+}
+
+static void dsa_free_dst(struct kref *ref)
+{
+ struct dsa_switch_tree *dst = container_of(ref, struct dsa_switch_tree,
+ refcount);
+
+ list_del(&dst->list);
+ kfree(dst);
+}
+
+static void dsa_put_dst(struct dsa_switch_tree *dst)
+{
+ kref_put(&dst->refcount, dsa_free_dst);
+}
+
+static struct dsa_switch_tree *dsa_add_dst(u32 tree)
+{
+ struct dsa_switch_tree *dst;
+
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ if (!dst)
+ return NULL;
+ dst->tree = tree;
+ dst->cpu_switch = -1;
+ INIT_LIST_HEAD(&dst->list);
+ list_add_tail(&dsa_switch_trees, &dst->list);
+ kref_init(&dst->refcount);
+
+ return dst;
+}
+
+static void dsa_dst_add_ds(struct dsa_switch_tree *dst,
+ struct dsa_switch *ds, u32 index)
+{
+ kref_get(&dst->refcount);
+ dst->ds[index] = ds;
+}
+
+static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
+ struct dsa_switch *ds, u32 index)
+{
+ dst->ds[index] = NULL;
+ kref_put(&dst->refcount, dsa_free_dst);
+}
+
+static bool dsa_port_is_dsa(struct device_node *port)
+{
+ const char *name;
+
+ name = of_get_property(port, "label", NULL);
+ if (!name)
+ return false;
+
+ if (!strcmp(name, "dsa"))
+ return true;
+
+ return false;
+}
+
+static bool dsa_port_is_cpu(struct device_node *port)
+{
+ const char *name;
+
+ name = of_get_property(port, "label", NULL);
+ if (!name)
+ return false;
+
+ if (!strcmp(name, "cpu"))
+ return true;
+
+ return false;
+}
+
+static bool dsa_ds_find_port(struct dsa_switch *ds,
+ struct device_node *port)
+{
+ u32 index;
+
+ for (index = 0; index < DSA_MAX_PORTS; index++)
+ if (ds->ports[index].dn == port)
+ return true;
+ return false;
+}
+
+static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
+ struct device_node *port)
+{
+ struct dsa_switch *ds;
+ u32 index;
+
+ for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+ ds = dst->ds[index];
+ if (!ds)
+ continue;
+
+ if (dsa_ds_find_port(ds, port))
+ return ds;
+ }
+
+ return NULL;
+}
+
+static int dsa_port_complete(struct dsa_switch_tree *dst,
+ struct dsa_switch *src_ds,
+ struct device_node *port,
+ u32 src_port)
+{
+ struct device_node *link;
+ int index;
+ struct dsa_switch *dst_ds;
+
+ for (index = 0;; index++) {
+ link = of_parse_phandle(port, "link", index);
+ if (!link)
+ break;
+
+ dst_ds = dsa_dst_find_port(dst, link);
+ of_node_put(link);
+
+ if (!dst_ds)
+ return 1;
+
+ src_ds->rtable[dst_ds->index] = src_port;
+ }
+
+ return 0;
+}
+
+/* A switch is complete if all the DSA ports phandles point to ports
+ * known in the tree. A return value of 1 means the tree is not
+ * complete. This is not an error condition. A value of 0 is
+ * success.
+ */
+static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds)
+{
+ struct device_node *port;
+ u32 index;
+ int err;
+
+ for (index = 0; index < DSA_MAX_PORTS; index++) {
+ port = ds->ports[index].dn;
+ if (!port)
+ continue;
+
+ if (!dsa_port_is_dsa(port))
+ continue;
+
+ err = dsa_port_complete(dst, ds, port, index);
+ if (err != 0)
+ return err;
+
+ ds->dsa_port_mask |= BIT(index);
+ }
+
+ return 0;
+}
+
+/* A tree is complete if all the DSA ports phandles point to ports
+ * known in the tree. A return value of 1 means the tree is not
+ * complete. This is not an error condition. A value of 0 is
+ * success.
+ */
+static int dsa_dst_complete(struct dsa_switch_tree *dst)
+{
+ struct dsa_switch *ds;
+ u32 index;
+ int err;
+
+ for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+ ds = dst->ds[index];
+ if (!ds)
+ continue;
+
+ err = dsa_ds_complete(dst, ds);
+ if (err != 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static int dsa_dsa_port_apply(struct device_node *port, u32 index,
+ struct dsa_switch *ds)
+{
+ int err;
+
+ err = dsa_cpu_dsa_setup(ds, ds->dev, port, index);
+ if (err) {
+ dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n",
+ index, err);
+ return err;
+ }
+
+ return 0;
+}
+
+static void dsa_dsa_port_unapply(struct device_node *port, u32 index,
+ struct dsa_switch *ds)
+{
+ dsa_cpu_dsa_destroy(port);
+}
+
+static int dsa_cpu_port_apply(struct device_node *port, u32 index,
+ struct dsa_switch *ds)
+{
+ int err;
+
+ err = dsa_cpu_dsa_setup(ds, ds->dev, port, index);
+ if (err) {
+ dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n",
+ index, err);
+ return err;
+ }
+
+ ds->cpu_port_mask |= BIT(index);
+
+ return 0;
+}
+
+static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
+ struct dsa_switch *ds)
+{
+ dsa_cpu_dsa_destroy(port);
+ ds->cpu_port_mask &= ~BIT(index);
+
+}
+
+static int dsa_user_port_apply(struct device_node *port, u32 index,
+ struct dsa_switch *ds)
+{
+ const char *name;
+ int err;
+
+ name = of_get_property(port, "label", NULL);
+
+ err = dsa_slave_create(ds, ds->dev, index, name);
+ if (err) {
+ dev_warn(ds->dev, "Failed to create slave %d: %d\n",
+ index, err);
+ return err;
+ }
+
+ return 0;
+}
+
+static void dsa_user_port_unapply(struct device_node *port, u32 index,
+ struct dsa_switch *ds)
+{
+ if (ds->ports[index].netdev) {
+ dsa_slave_destroy(ds->ports[index].netdev);
+ ds->ports[index].netdev = NULL;
+ ds->enabled_port_mask &= ~(1 << index);
+ }
+}
+
+static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
+{
+ struct device_node *port;
+ u32 index;
+ int err;
+
+ /* Initialize ds->phys_mii_mask before registering the slave MDIO bus
+ * driver and before drv->setup() has run, since the switch drivers and
+ * the slave MDIO bus driver rely on these values for probing PHY
+ * devices or not
+ */
+ ds->phys_mii_mask = ds->enabled_port_mask;
+
+ err = ds->drv->setup(ds);
+ if (err < 0)
+ return err;
+
+ err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr);
+ if (err < 0)
+ return err;
+
+ err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr);
+ if (err < 0)
+ return err;
+
+ if (!ds->slave_mii_bus && ds->drv->phy_read) {
+ ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
+ if (!ds->slave_mii_bus)
+ return -ENOMEM;
+
+ dsa_slave_mii_bus_init(ds);
+
+ err = mdiobus_register(ds->slave_mii_bus);
+ if (err < 0)
+ return err;
+ }
+
+ for (index = 0; index < DSA_MAX_PORTS; index++) {
+ port = ds->ports[index].dn;
+ if (!port)
+ continue;
+
+ if (dsa_port_is_dsa(port)) {
+ err = dsa_dsa_port_apply(port, index, ds);
+ if (err)
+ return err;
+ continue;
+ }
+
+ if (dsa_port_is_cpu(port)) {
+ err = dsa_cpu_port_apply(port, index, ds);
+ if (err)
+ return err;
+ continue;
+ }
+
+ err = dsa_user_port_apply(port, index, ds);
+ if (err)
+ continue;
+ }
+
+ return 0;
+}
+
+static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
+{
+ struct device_node *port;
+ u32 index;
+
+ for (index = 0; index < DSA_MAX_PORTS; index++) {
+ port = ds->ports[index].dn;
+ if (!port)
+ continue;
+
+ if (dsa_port_is_dsa(port)) {
+ dsa_dsa_port_unapply(port, index, ds);
+ continue;
+ }
+
+ if (dsa_port_is_cpu(port)) {
+ dsa_cpu_port_unapply(port, index, ds);
+ continue;
+ }
+
+ dsa_user_port_unapply(port, index, ds);
+ }
+
+ if (ds->slave_mii_bus && ds->drv->phy_read)
+ mdiobus_unregister(ds->slave_mii_bus);
+}
+
+static int dsa_dst_apply(struct dsa_switch_tree *dst)
+{
+ struct dsa_switch *ds;
+ u32 index;
+ int err;
+
+ for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+ ds = dst->ds[index];
+ if (!ds)
+ continue;
+
+ err = dsa_ds_apply(dst, ds);
+ if (err)
+ return err;
+ }
+
+ err = dsa_cpu_port_ethtool_setup(dst->ds[0]);
+ if (err)
+ return err;
+
+ /* If we use a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point on get
+ * sent to the tag format's receive function.
+ */
+ wmb();
+ dst->master_netdev->dsa_ptr = (void *)dst;
+ dst->applied = true;
+
+ return 0;
+}
+
+static void dsa_dst_unapply(struct dsa_switch_tree *dst)
+{
+ struct dsa_switch *ds;
+ u32 index;
+
+ if (!dst->applied)
+ return;
+
+ dst->master_netdev->dsa_ptr = NULL;
+
+ /* If we used a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point get sent
+ * without the tag and go through the regular receive path.
+ */
+ wmb();
+
+ for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+ ds = dst->ds[index];
+ if (!ds)
+ continue;
+
+ dsa_ds_unapply(dst, ds);
+ }
+
+ dsa_cpu_port_ethtool_restore(dst->ds[0]);
+
+ pr_info("DSA: tree %d unapplied\n", dst->tree);
+ dst->applied = false;
+}
+
+static int dsa_cpu_parse(struct device_node *port, u32 index,
+ struct dsa_switch_tree *dst,
+ struct dsa_switch *ds)
+{
+ struct net_device *ethernet_dev;
+ struct device_node *ethernet;
+
+ ethernet = of_parse_phandle(port, "ethernet", 0);
+ if (!ethernet)
+ return -EINVAL;
+
+ ethernet_dev = of_find_net_device_by_node(ethernet);
+ if (!ethernet_dev)
+ return -EPROBE_DEFER;
+
+ if (!ds->master_netdev)
+ ds->master_netdev = ethernet_dev;
+
+ if (!dst->master_netdev)
+ dst->master_netdev = ethernet_dev;
+
+ if (dst->cpu_switch == -1) {
+ dst->cpu_switch = ds->index;
+ dst->cpu_port = index;
+ }
+
+ dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol);
+ if (IS_ERR(dst->tag_ops)) {
+ dev_warn(ds->dev, "No tagger for this switch\n");
+ return PTR_ERR(dst->tag_ops);
+ }
+
+ dst->rcv = dst->tag_ops->rcv;
+
+ return 0;
+}
+
+static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds)
+{
+ struct device_node *port;
+ u32 index;
+ int err;
+
+ for (index = 0; index < DSA_MAX_PORTS; index++) {
+ port = ds->ports[index].dn;
+ if (!port)
+ continue;
+
+ if (dsa_port_is_cpu(port)) {
+ err = dsa_cpu_parse(port, index, dst, ds);
+ if (err)
+ return err;
+ }
+ }
+
+ pr_info("DSA: switch %d %d parsed\n", dst->tree, ds->index);
+
+ return 0;
+}
+
+static int dsa_dst_parse(struct dsa_switch_tree *dst)
+{
+ struct dsa_switch *ds;
+ u32 index;
+ int err;
+
+ for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+ ds = dst->ds[index];
+ if (!ds)
+ continue;
+
+ err = dsa_ds_parse(dst, ds);
+ if (err)
+ return err;
+ }
+
+ if (!dst->master_netdev) {
+ pr_warn("Tree has no master device\n");
+ return -EINVAL;
+ }
+
+ pr_info("DSA: tree %d parsed\n", dst->tree);
+
+ return 0;
+}
+
+static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
+{
+ struct device_node *port;
+ int err;
+ u32 reg;
+
+ for_each_available_child_of_node(ports, port) {
+ err = of_property_read_u32(port, "reg", &reg);
+ if (err)
+ return err;
+
+ if (reg >= DSA_MAX_PORTS)
+ return -EINVAL;
+
+ ds->ports[reg].dn = port;
+
+ /* Initialize enabled_port_mask now for drv->setup()
+ * to have access to a correct value, just like what
+ * net/dsa/dsa.c::dsa_switch_setup_one does.
+ */
+ if (!dsa_port_is_cpu(port))
+ ds->enabled_port_mask |= 1 << reg;
+ }
+
+ return 0;
+}
+
+static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index)
+{
+ int err;
+
+ *tree = *index = 0;
+
+ err = of_property_read_u32_index(np, "dsa,member", 0, tree);
+ if (err) {
+ /* Does not exist, but it is optional */
+ if (err == -EINVAL)
+ return 0;
+ return err;
+ }
+
+ err = of_property_read_u32_index(np, "dsa,member", 1, index);
+ if (err)
+ return err;
+
+ if (*index >= DSA_MAX_SWITCHES)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct device_node *dsa_get_ports(struct dsa_switch *ds,
+ struct device_node *np)
+{
+ struct device_node *ports;
+
+ ports = of_get_child_by_name(np, "ports");
+ if (!ports) {
+ dev_err(ds->dev, "no ports child node found\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return ports;
+}
+
+static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+{
+ struct device_node *ports = dsa_get_ports(ds, np);
+ struct dsa_switch_tree *dst;
+ u32 tree, index;
+ int i, err;
+
+ err = dsa_parse_member(np, &tree, &index);
+ if (err)
+ return err;
+
+ if (IS_ERR(ports))
+ return PTR_ERR(ports);
+
+ err = dsa_parse_ports_dn(ports, ds);
+ if (err)
+ return err;
+
+ dst = dsa_get_dst(tree);
+ if (!dst) {
+ dst = dsa_add_dst(tree);
+ if (!dst)
+ return -ENOMEM;
+ }
+
+ if (dst->ds[index]) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ ds->dst = dst;
+ ds->index = index;
+
+ /* Initialize the routing table */
+ for (i = 0; i < DSA_MAX_SWITCHES; ++i)
+ ds->rtable[i] = DSA_RTABLE_NONE;
+
+ dsa_dst_add_ds(dst, ds, index);
+
+ err = dsa_dst_complete(dst);
+ if (err < 0)
+ goto out_del_dst;
+
+ if (err == 1) {
+ /* Not all switches registered yet */
+ err = 0;
+ goto out;
+ }
+
+ if (dst->applied) {
+ pr_info("DSA: Disjoint trees?\n");
+ return -EINVAL;
+ }
+
+ err = dsa_dst_parse(dst);
+ if (err)
+ goto out_del_dst;
+
+ err = dsa_dst_apply(dst);
+ if (err) {
+ dsa_dst_unapply(dst);
+ goto out_del_dst;
+ }
+
+ dsa_put_dst(dst);
+ return 0;
+
+out_del_dst:
+ dsa_dst_del_ds(dst, ds, ds->index);
+out:
+ dsa_put_dst(dst);
+
+ return err;
+}
+
+int dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+{
+ int err;
+
+ mutex_lock(&dsa2_mutex);
+ err = _dsa_register_switch(ds, np);
+ mutex_unlock(&dsa2_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(dsa_register_switch);
+
+static void _dsa_unregister_switch(struct dsa_switch *ds)
+{
+ struct dsa_switch_tree *dst = ds->dst;
+
+ dsa_dst_unapply(dst);
+
+ dsa_dst_del_ds(dst, ds, ds->index);
+}
+
+void dsa_unregister_switch(struct dsa_switch *ds)
+{
+ mutex_lock(&dsa2_mutex);
+ _dsa_unregister_switch(ds);
+ mutex_unlock(&dsa2_mutex);
+}
+EXPORT_SYMBOL_GPL(dsa_unregister_switch);
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index dfa33779d..00077a9c9 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -50,12 +50,19 @@ struct dsa_slave_priv {
/* dsa.c */
extern char dsa_driver_version[];
+int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
+ struct device_node *port_dn, int port);
+void dsa_cpu_dsa_destroy(struct device_node *port_dn);
+const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
+int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
+void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
/* slave.c */
extern const struct dsa_device_ops notag_netdev_ops;
void dsa_slave_mii_bus_init(struct dsa_switch *ds);
+void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops);
int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
- int port, char *name);
+ int port, const char *name);
void dsa_slave_destroy(struct net_device *slave_dev);
int dsa_slave_suspend(struct net_device *slave_dev);
int dsa_slave_resume(struct net_device *slave_dev);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 152436cda..fc9196745 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -49,8 +49,8 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds)
ds->slave_mii_bus->name = "dsa slave smi";
ds->slave_mii_bus->read = dsa_slave_phy_read;
ds->slave_mii_bus->write = dsa_slave_phy_write;
- snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x",
- ds->index, ds->cd->sw_addr);
+ snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d",
+ ds->dst->tree, ds->index);
ds->slave_mii_bus->parent = ds->dev;
ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask;
}
@@ -333,6 +333,44 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
return 0;
}
+static int dsa_fastest_ageing_time(struct dsa_switch *ds,
+ unsigned int ageing_time)
+{
+ int i;
+
+ for (i = 0; i < DSA_MAX_PORTS; ++i) {
+ struct dsa_port *dp = &ds->ports[i];
+
+ if (dp && dp->ageing_time && dp->ageing_time < ageing_time)
+ ageing_time = dp->ageing_time;
+ }
+
+ return ageing_time;
+}
+
+static int dsa_slave_ageing_time(struct net_device *dev,
+ const struct switchdev_attr *attr,
+ struct switchdev_trans *trans)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time);
+ unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies);
+
+ /* bridge skips -EOPNOTSUPP, so skip the prepare phase */
+ if (switchdev_trans_ph_prepare(trans))
+ return 0;
+
+ /* Keep the fastest ageing time in case of multiple bridges */
+ ds->ports[p->port].ageing_time = ageing_time;
+ ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
+
+ if (ds->drv->set_ageing_time)
+ return ds->drv->set_ageing_time(ds, ageing_time);
+
+ return 0;
+}
+
static int dsa_slave_port_attr_set(struct net_device *dev,
const struct switchdev_attr *attr,
struct switchdev_trans *trans)
@@ -346,6 +384,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
ret = dsa_slave_vlan_filtering(dev, attr, trans);
break;
+ case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+ ret = dsa_slave_ageing_time(dev, attr, trans);
+ break;
default:
ret = -EOPNOTSUPP;
break;
@@ -522,14 +563,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
-static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
- struct net_device *dev)
-{
- /* Just return the original SKB */
- return skb;
-}
-
-
/* ethtool operations *******************************************************/
static int
dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
@@ -615,7 +648,7 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev)
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (ds->cd->eeprom_len)
+ if (ds->cd && ds->cd->eeprom_len)
return ds->cd->eeprom_len;
if (ds->drv->get_eeprom_len)
@@ -873,6 +906,13 @@ static void dsa_slave_poll_controller(struct net_device *dev)
}
#endif
+void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
+{
+ ops->get_sset_count = dsa_cpu_port_get_sset_count;
+ ops->get_ethtool_stats = dsa_cpu_port_get_ethtool_stats;
+ ops->get_strings = dsa_cpu_port_get_strings;
+}
+
static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_settings = dsa_slave_get_settings,
.set_settings = dsa_slave_set_settings,
@@ -893,8 +933,6 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_eee = dsa_slave_get_eee,
};
-static struct ethtool_ops dsa_cpu_port_ethtool_ops;
-
static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
@@ -999,13 +1037,12 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
struct net_device *slave_dev)
{
struct dsa_switch *ds = p->parent;
- struct dsa_chip_data *cd = ds->cd;
struct device_node *phy_dn, *port_dn;
bool phy_is_fixed = false;
u32 phy_flags = 0;
int mode, ret;
- port_dn = cd->port_dn[p->port];
+ port_dn = ds->ports[p->port].dn;
mode = of_get_phy_mode(port_dn);
if (mode < 0)
mode = PHY_INTERFACE_MODE_NA;
@@ -1109,14 +1146,18 @@ int dsa_slave_resume(struct net_device *slave_dev)
}
int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
- int port, char *name)
+ int port, const char *name)
{
- struct net_device *master = ds->dst->master_netdev;
struct dsa_switch_tree *dst = ds->dst;
+ struct net_device *master;
struct net_device *slave_dev;
struct dsa_slave_priv *p;
int ret;
+ master = ds->dst->master_netdev;
+ if (ds->master_netdev)
+ master = ds->master_netdev;
+
slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name,
NET_NAME_UNKNOWN, ether_setup);
if (slave_dev == NULL)
@@ -1124,19 +1165,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
slave_dev->features = master->vlan_features;
slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
- if (master->ethtool_ops != &dsa_cpu_port_ethtool_ops) {
- memcpy(&dst->master_ethtool_ops, master->ethtool_ops,
- sizeof(struct ethtool_ops));
- memcpy(&dsa_cpu_port_ethtool_ops, &dst->master_ethtool_ops,
- sizeof(struct ethtool_ops));
- dsa_cpu_port_ethtool_ops.get_sset_count =
- dsa_cpu_port_get_sset_count;
- dsa_cpu_port_ethtool_ops.get_ethtool_stats =
- dsa_cpu_port_get_ethtool_stats;
- dsa_cpu_port_ethtool_ops.get_strings =
- dsa_cpu_port_get_strings;
- master->ethtool_ops = &dsa_cpu_port_ethtool_ops;
- }
eth_hw_addr_inherit(slave_dev, master);
slave_dev->priv_flags |= IFF_NO_QUEUE;
slave_dev->netdev_ops = &dsa_slave_netdev_ops;
@@ -1147,49 +1175,24 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
NULL);
SET_NETDEV_DEV(slave_dev, parent);
- slave_dev->dev.of_node = ds->cd->port_dn[port];
+ slave_dev->dev.of_node = ds->ports[port].dn;
slave_dev->vlan_features = master->vlan_features;
p = netdev_priv(slave_dev);
p->parent = ds;
p->port = port;
-
- switch (ds->dst->tag_protocol) {
-#ifdef CONFIG_NET_DSA_TAG_DSA
- case DSA_TAG_PROTO_DSA:
- p->xmit = dsa_netdev_ops.xmit;
- break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_EDSA
- case DSA_TAG_PROTO_EDSA:
- p->xmit = edsa_netdev_ops.xmit;
- break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_TRAILER
- case DSA_TAG_PROTO_TRAILER:
- p->xmit = trailer_netdev_ops.xmit;
- break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_BRCM
- case DSA_TAG_PROTO_BRCM:
- p->xmit = brcm_netdev_ops.xmit;
- break;
-#endif
- default:
- p->xmit = dsa_slave_notag_xmit;
- break;
- }
+ p->xmit = dst->tag_ops->xmit;
p->old_pause = -1;
p->old_link = -1;
p->old_duplex = -1;
- ds->ports[port] = slave_dev;
+ ds->ports[port].netdev = slave_dev;
ret = register_netdev(slave_dev);
if (ret) {
netdev_err(master, "error %d registering interface %s\n",
ret, slave_dev->name);
- ds->ports[port] = NULL;
+ ds->ports[port].netdev = NULL;
free_netdev(slave_dev);
return ret;
}
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index e2aadb731..21bffde6e 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -127,7 +127,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
/* Validate port against switch setup, either the port is totally */
- if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+ if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
goto out_drop;
/* Remove Broadcom tag and update checksum */
@@ -140,7 +140,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
skb_push(skb, ETH_HLEN);
skb->pkt_type = PACKET_HOST;
- skb->dev = ds->ports[source_port];
+ skb->dev = ds->ports[source_port].netdev;
skb->protocol = eth_type_trans(skb, skb->dev);
skb->dev->stats.rx_packets++;
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index aa780e4ac..bce79ffe3 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -107,10 +107,14 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
* Check that the source device exists and that the source
* port is a registered DSA port.
*/
- if (source_device >= dst->pd->nr_chips)
+ if (source_device >= DSA_MAX_SWITCHES)
goto out_drop;
+
ds = dst->ds[source_device];
- if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+ if (!ds)
+ goto out_drop;
+
+ if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
goto out_drop;
/*
@@ -159,7 +163,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
2 * ETH_ALEN);
}
- skb->dev = ds->ports[source_port];
+ skb->dev = ds->ports[source_port].netdev;
skb_push(skb, ETH_HLEN);
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, skb->dev);
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 2288c8098..6c1720e88 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -120,10 +120,14 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
* Check that the source device exists and that the source
* port is a registered DSA port.
*/
- if (source_device >= dst->pd->nr_chips)
+ if (source_device >= DSA_MAX_SWITCHES)
goto out_drop;
+
ds = dst->ds[source_device];
- if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+ if (!ds)
+ goto out_drop;
+
+ if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
goto out_drop;
/*
@@ -178,7 +182,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
2 * ETH_ALEN);
}
- skb->dev = ds->ports[source_port];
+ skb->dev = ds->ports[source_port].netdev;
skb_push(skb, ETH_HLEN);
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, skb->dev);
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index b6ca0890d..5e3903eb1 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -82,12 +82,12 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
goto out_drop;
source_port = trailer[1] & 7;
- if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+ if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
goto out_drop;
pskb_trim_rcsum(skb, skb->len - 4);
- skb->dev = ds->ports[source_port];
+ skb->dev = ds->ports[source_port].netdev;
skb_push(skb, ETH_HLEN);
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, skb->dev);
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index dd085db85..d7efbf0da 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -58,21 +58,10 @@ static struct header_ops lowpan_header_ops = {
.create = lowpan_header_create,
};
-static struct lock_class_key lowpan_tx_busylock;
-static struct lock_class_key lowpan_netdev_xmit_lock_key;
-
-static void lowpan_set_lockdep_class_one(struct net_device *ldev,
- struct netdev_queue *txq,
- void *_unused)
-{
- lockdep_set_class(&txq->_xmit_lock,
- &lowpan_netdev_xmit_lock_key);
-}
-
static int lowpan_dev_init(struct net_device *ldev)
{
- netdev_for_each_tx_queue(ldev, lowpan_set_lockdep_class_one, NULL);
- ldev->qdisc_tx_busylock = &lowpan_tx_busylock;
+ netdev_lockdep_set_classes(ldev);
+
return 0;
}
@@ -92,11 +81,21 @@ static int lowpan_stop(struct net_device *dev)
return 0;
}
+static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n)
+{
+ struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n));
+
+ /* default no short_addr is available for a neighbour */
+ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+ return 0;
+}
+
static const struct net_device_ops lowpan_netdev_ops = {
.ndo_init = lowpan_dev_init,
.ndo_start_xmit = lowpan_xmit,
.ndo_open = lowpan_open,
.ndo_stop = lowpan_stop,
+ .ndo_neigh_construct = lowpan_neigh_construct,
};
static void lowpan_setup(struct net_device *ldev)
@@ -131,8 +130,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
pr_debug("adding new link\n");
- if (!tb[IFLA_LINK] ||
- !net_eq(dev_net(ldev), &init_net))
+ if (!tb[IFLA_LINK])
return -EINVAL;
/* find and hold wpan device */
wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK]));
@@ -161,6 +159,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
wdev->needed_headroom;
ldev->needed_tailroom = wdev->needed_tailroom;
+ ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh);
+
ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154);
if (ret < 0) {
dev_put(wdev);
diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c
index ef185dd41..649e7d45e 100644
--- a/net/ieee802154/6lowpan/rx.c
+++ b/net/ieee802154/6lowpan/rx.c
@@ -262,7 +262,7 @@ static inline bool lowpan_rx_h_check(struct sk_buff *skb)
/* check on ieee802154 conform 6LoWPAN header */
if (!ieee802154_is_data(fc) ||
- !ieee802154_is_intra_pan(fc))
+ !ieee802154_skb_is_intra_pan_addressing(fc, skb))
return false;
/* check if we can dereference the dispatch */
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index e459afd16..dbb476d7d 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -9,6 +9,7 @@
*/
#include <net/6lowpan.h>
+#include <net/ndisc.h>
#include <net/ieee802154_netdev.h>
#include <net/mac802154.h>
@@ -17,19 +18,9 @@
#define LOWPAN_FRAG1_HEAD_SIZE 0x4
#define LOWPAN_FRAGN_HEAD_SIZE 0x5
-/* don't save pan id, it's intra pan */
-struct lowpan_addr {
- u8 mode;
- union {
- /* IPv6 needs big endian here */
- __be64 extended_addr;
- __be16 short_addr;
- } u;
-};
-
struct lowpan_addr_info {
- struct lowpan_addr daddr;
- struct lowpan_addr saddr;
+ struct ieee802154_addr daddr;
+ struct ieee802154_addr saddr;
};
static inline struct
@@ -48,12 +39,14 @@ lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb)
* RAW/DGRAM sockets.
*/
int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev,
- unsigned short type, const void *_daddr,
- const void *_saddr, unsigned int len)
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
{
- const u8 *saddr = _saddr;
- const u8 *daddr = _daddr;
- struct lowpan_addr_info *info;
+ struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr;
+ struct lowpan_addr_info *info = lowpan_skb_priv(skb);
+ struct lowpan_802154_neigh *llneigh = NULL;
+ const struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct neighbour *n;
/* TODO:
* if this package isn't ipv6 one, where should it be routed?
@@ -61,21 +54,50 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev,
if (type != ETH_P_IPV6)
return 0;
- if (!saddr)
- saddr = ldev->dev_addr;
+ /* intra-pan communication */
+ info->saddr.pan_id = wpan_dev->pan_id;
+ info->daddr.pan_id = info->saddr.pan_id;
- raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8);
- raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8);
+ if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) {
+ info->daddr.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
+ info->daddr.mode = IEEE802154_ADDR_SHORT;
+ } else {
+ __le16 short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+
+ n = neigh_lookup(&nd_tbl, &hdr->daddr, ldev);
+ if (n) {
+ llneigh = lowpan_802154_neigh(neighbour_priv(n));
+ read_lock_bh(&n->lock);
+ short_addr = llneigh->short_addr;
+ read_unlock_bh(&n->lock);
+ }
- info = lowpan_skb_priv(skb);
+ if (llneigh &&
+ lowpan_802154_is_valid_src_short_addr(short_addr)) {
+ info->daddr.short_addr = short_addr;
+ info->daddr.mode = IEEE802154_ADDR_SHORT;
+ } else {
+ info->daddr.mode = IEEE802154_ADDR_LONG;
+ ieee802154_be64_to_le64(&info->daddr.extended_addr,
+ daddr);
+ }
- /* TODO: Currently we only support extended_addr */
- info->daddr.mode = IEEE802154_ADDR_LONG;
- memcpy(&info->daddr.u.extended_addr, daddr,
- sizeof(info->daddr.u.extended_addr));
- info->saddr.mode = IEEE802154_ADDR_LONG;
- memcpy(&info->saddr.u.extended_addr, saddr,
- sizeof(info->daddr.u.extended_addr));
+ if (n)
+ neigh_release(n);
+ }
+
+ if (!saddr) {
+ if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) {
+ info->saddr.mode = IEEE802154_ADDR_SHORT;
+ info->saddr.short_addr = wpan_dev->short_addr;
+ } else {
+ info->saddr.mode = IEEE802154_ADDR_LONG;
+ info->saddr.extended_addr = wpan_dev->extended_addr;
+ }
+ } else {
+ info->saddr.mode = IEEE802154_ADDR_LONG;
+ ieee802154_be64_to_le64(&info->saddr.extended_addr, saddr);
+ }
return 0;
}
@@ -209,47 +231,26 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *ldev,
u16 *dgram_size, u16 *dgram_offset)
{
struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr;
- struct ieee802154_addr sa, da;
struct ieee802154_mac_cb *cb = mac_cb_init(skb);
struct lowpan_addr_info info;
- void *daddr, *saddr;
memcpy(&info, lowpan_skb_priv(skb), sizeof(info));
- /* TODO: Currently we only support extended_addr */
- daddr = &info.daddr.u.extended_addr;
- saddr = &info.saddr.u.extended_addr;
-
*dgram_size = skb->len;
- lowpan_header_compress(skb, ldev, daddr, saddr);
+ lowpan_header_compress(skb, ldev, &info.daddr, &info.saddr);
/* dgram_offset = (saved bytes after compression) + lowpan header len */
*dgram_offset = (*dgram_size - skb->len) + skb_network_header_len(skb);
cb->type = IEEE802154_FC_TYPE_DATA;
- /* prepare wpan address data */
- sa.mode = IEEE802154_ADDR_LONG;
- sa.pan_id = wpan_dev->pan_id;
- sa.extended_addr = ieee802154_devaddr_from_raw(saddr);
-
- /* intra-PAN communications */
- da.pan_id = sa.pan_id;
-
- /* if the destination address is the broadcast address, use the
- * corresponding short address
- */
- if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) {
- da.mode = IEEE802154_ADDR_SHORT;
- da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
+ if (info.daddr.mode == IEEE802154_ADDR_SHORT &&
+ ieee802154_is_broadcast_short_addr(info.daddr.short_addr))
cb->ackreq = false;
- } else {
- da.mode = IEEE802154_ADDR_LONG;
- da.extended_addr = ieee802154_devaddr_from_raw(daddr);
+ else
cb->ackreq = wpan_dev->ackreq;
- }
- return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, &da,
- &sa, 0);
+ return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev,
+ &info.daddr, &info.saddr, 0);
}
netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev)
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index c35fdfa6d..cb7176cd4 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -140,6 +140,8 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
rdev->wpan_phy.dev.class = &wpan_phy_class;
rdev->wpan_phy.dev.platform_data = rdev;
+ wpan_phy_net_set(&rdev->wpan_phy, &init_net);
+
init_waitqueue_head(&rdev->dev_wait);
return &rdev->wpan_phy;
@@ -207,6 +209,49 @@ void wpan_phy_free(struct wpan_phy *phy)
}
EXPORT_SYMBOL(wpan_phy_free);
+int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
+ struct net *net)
+{
+ struct wpan_dev *wpan_dev;
+ int err = 0;
+
+ list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) {
+ if (!wpan_dev->netdev)
+ continue;
+ wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+ err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d");
+ if (err)
+ break;
+ wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
+ }
+
+ if (err) {
+ /* failed -- clean up to old netns */
+ net = wpan_phy_net(&rdev->wpan_phy);
+
+ list_for_each_entry_continue_reverse(wpan_dev,
+ &rdev->wpan_dev_list,
+ list) {
+ if (!wpan_dev->netdev)
+ continue;
+ wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+ err = dev_change_net_namespace(wpan_dev->netdev, net,
+ "wpan%d");
+ WARN_ON(err);
+ wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
+ }
+
+ return err;
+ }
+
+ wpan_phy_net_set(&rdev->wpan_phy, net);
+
+ err = device_rename(&rdev->wpan_phy.dev, dev_name(&rdev->wpan_phy.dev));
+ WARN_ON(err);
+
+ return 0;
+}
+
void cfg802154_dev_free(struct cfg802154_registered_device *rdev)
{
kfree(rdev);
@@ -286,14 +331,34 @@ static struct notifier_block cfg802154_netdev_notifier = {
.notifier_call = cfg802154_netdev_notifier_call,
};
+static void __net_exit cfg802154_pernet_exit(struct net *net)
+{
+ struct cfg802154_registered_device *rdev;
+
+ rtnl_lock();
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ if (net_eq(wpan_phy_net(&rdev->wpan_phy), net))
+ WARN_ON(cfg802154_switch_netns(rdev, &init_net));
+ }
+ rtnl_unlock();
+}
+
+static struct pernet_operations cfg802154_pernet_ops = {
+ .exit = cfg802154_pernet_exit,
+};
+
static int __init wpan_phy_class_init(void)
{
int rc;
- rc = wpan_phy_sysfs_init();
+ rc = register_pernet_device(&cfg802154_pernet_ops);
if (rc)
goto err;
+ rc = wpan_phy_sysfs_init();
+ if (rc)
+ goto err_sysfs;
+
rc = register_netdevice_notifier(&cfg802154_netdev_notifier);
if (rc)
goto err_nl;
@@ -315,6 +380,8 @@ err_notifier:
unregister_netdevice_notifier(&cfg802154_netdev_notifier);
err_nl:
wpan_phy_sysfs_exit();
+err_sysfs:
+ unregister_pernet_device(&cfg802154_pernet_ops);
err:
return rc;
}
@@ -326,6 +393,7 @@ static void __exit wpan_phy_class_exit(void)
ieee802154_nl_exit();
unregister_netdevice_notifier(&cfg802154_netdev_notifier);
wpan_phy_sysfs_exit();
+ unregister_pernet_device(&cfg802154_pernet_ops);
}
module_exit(wpan_phy_class_exit);
diff --git a/net/ieee802154/core.h b/net/ieee802154/core.h
index 231fade95..81141f58d 100644
--- a/net/ieee802154/core.h
+++ b/net/ieee802154/core.h
@@ -38,6 +38,8 @@ wpan_phy_to_rdev(struct wpan_phy *wpan_phy)
extern struct list_head cfg802154_rdev_list;
extern int cfg802154_rdev_list_generation;
+int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
+ struct net *net);
/* free object */
void cfg802154_dev_free(struct cfg802154_registered_device *rdev);
struct cfg802154_registered_device *
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 116187b5c..d90a4ed5b 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -80,7 +80,8 @@ __cfg802154_wpan_dev_from_attrs(struct net *netns, struct nlattr **attrs)
list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
struct wpan_dev *wpan_dev;
- /* TODO netns compare */
+ if (wpan_phy_net(&rdev->wpan_phy) != netns)
+ continue;
if (have_wpan_dev_id && rdev->wpan_phy_idx != wpan_phy_idx)
continue;
@@ -175,7 +176,8 @@ __cfg802154_rdev_from_attrs(struct net *netns, struct nlattr **attrs)
if (!rdev)
return ERR_PTR(-ENODEV);
- /* TODO netns compare */
+ if (netns != wpan_phy_net(&rdev->wpan_phy))
+ return ERR_PTR(-ENODEV);
return rdev;
}
@@ -233,6 +235,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
[NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 },
+ [NL802154_ATTR_PID] = { .type = NLA_U32 },
+ [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 },
#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
[NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, },
[NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, },
@@ -590,7 +594,6 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb,
struct cfg802154_registered_device *rdev;
int ifidx = nla_get_u32(tb[NL802154_ATTR_IFINDEX]);
- /* TODO netns */
netdev = __dev_get_by_index(&init_net, ifidx);
if (!netdev)
return -ENODEV;
@@ -629,7 +632,8 @@ nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb)
}
list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
- /* TODO net ns compare */
+ if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk)))
+ continue;
if (++idx <= state->start)
continue;
if (state->filter_wpan_phy != -1 &&
@@ -871,7 +875,8 @@ nl802154_dump_interface(struct sk_buff *skb, struct netlink_callback *cb)
rtnl_lock();
list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
- /* TODO netns compare */
+ if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk)))
+ continue;
if (wp_idx < wp_start) {
wp_idx++;
continue;
@@ -1271,6 +1276,37 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info)
return rdev_set_ackreq_default(rdev, wpan_dev, ackreq);
}
+static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net *net;
+ int err;
+
+ if (info->attrs[NL802154_ATTR_PID]) {
+ u32 pid = nla_get_u32(info->attrs[NL802154_ATTR_PID]);
+
+ net = get_net_ns_by_pid(pid);
+ } else if (info->attrs[NL802154_ATTR_NETNS_FD]) {
+ u32 fd = nla_get_u32(info->attrs[NL802154_ATTR_NETNS_FD]);
+
+ net = get_net_ns_by_fd(fd);
+ } else {
+ return -EINVAL;
+ }
+
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ err = 0;
+
+ /* check if anything to do */
+ if (!net_eq(wpan_phy_net(&rdev->wpan_phy), net))
+ err = cfg802154_switch_netns(rdev, net);
+
+ put_net(net);
+ return err;
+}
+
#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = {
[NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 },
@@ -2262,6 +2298,14 @@ static const struct genl_ops nl802154_ops[] = {
NL802154_FLAG_NEED_RTNL,
},
{
+ .cmd = NL802154_CMD_SET_WPAN_PHY_NETNS,
+ .doit = nl802154_wpan_phy_netns,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
.cmd = NL802154_CMD_SET_PAN_ID,
.doit = nl802154_set_pan_id,
.policy = nl802154_policy,
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index eb51c43c2..c4ed105ac 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -532,6 +532,22 @@ config TCP_CONG_VEGAS
window. TCP Vegas should provide less packet loss, but it is
not as aggressive as TCP Reno.
+config TCP_CONG_NV
+ tristate "TCP NV"
+ default n
+ ---help---
+ TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+ 10G networks, measurement noise introduced by LRO, GRO and interrupt
+ coalescence. In addition, it will decrease its cwnd multiplicatively
+ instead of linearly.
+
+ Note that in general congestion avoidance (cwnd decreased when # packets
+ queued grows) cannot coexist with congestion control (cwnd decreased only
+ when there is packet loss) due to fairness issues. One scenario when they
+ can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+
+ For further details see http://www.brakmo.org/networking/tcp-nv/
+
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
default n
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bfa133691..24629b6f5 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d39e9e47a..55513e654 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -73,7 +73,7 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/kmod.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
@@ -1916,6 +1916,3 @@ static int __init ipv4_proc_init(void)
return 0;
}
#endif /* CONFIG_PROC_FS */
-
-MODULE_ALIAS_NETPROTO(PF_INET);
-
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 40d6b8771..72d6f056d 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -135,76 +135,6 @@ int cipso_v4_rbm_strictvalid = 1;
*/
/**
- * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit
- * @bitmap: the bitmap
- * @bitmap_len: length in bits
- * @offset: starting offset
- * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
- *
- * Description:
- * Starting at @offset, walk the bitmap from left to right until either the
- * desired bit is found or we reach the end. Return the bit offset, -1 if
- * not found, or -2 if error.
- */
-static int cipso_v4_bitmap_walk(const unsigned char *bitmap,
- u32 bitmap_len,
- u32 offset,
- u8 state)
-{
- u32 bit_spot;
- u32 byte_offset;
- unsigned char bitmask;
- unsigned char byte;
-
- /* gcc always rounds to zero when doing integer division */
- byte_offset = offset / 8;
- byte = bitmap[byte_offset];
- bit_spot = offset;
- bitmask = 0x80 >> (offset % 8);
-
- while (bit_spot < bitmap_len) {
- if ((state && (byte & bitmask) == bitmask) ||
- (state == 0 && (byte & bitmask) == 0))
- return bit_spot;
-
- bit_spot++;
- bitmask >>= 1;
- if (bitmask == 0) {
- byte = bitmap[++byte_offset];
- bitmask = 0x80;
- }
- }
-
- return -1;
-}
-
-/**
- * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap
- * @bitmap: the bitmap
- * @bit: the bit
- * @state: if non-zero, set the bit (1) else clear the bit (0)
- *
- * Description:
- * Set a single bit in the bitmask. Returns zero on success, negative values
- * on error.
- */
-static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
- u32 bit,
- u8 state)
-{
- u32 byte_spot;
- u8 bitmask;
-
- /* gcc always rounds to zero when doing integer division */
- byte_spot = bit / 8;
- bitmask = 0x80 >> (bit % 8);
- if (state)
- bitmap[byte_spot] |= bitmask;
- else
- bitmap[byte_spot] &= ~bitmask;
-}
-
-/**
* cipso_v4_cache_entry_free - Frees a cache entry
* @entry: the entry to free
*
@@ -840,10 +770,10 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
cipso_cat_size = doi_def->map.std->cat.cipso_size;
cipso_array = doi_def->map.std->cat.cipso;
for (;;) {
- cat = cipso_v4_bitmap_walk(bitmap,
- bitmap_len_bits,
- cat + 1,
- 1);
+ cat = netlbl_bitmap_walk(bitmap,
+ bitmap_len_bits,
+ cat + 1,
+ 1);
if (cat < 0)
break;
if (cat >= cipso_cat_size ||
@@ -909,7 +839,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
}
if (net_spot >= net_clen_bits)
return -ENOSPC;
- cipso_v4_bitmap_setbit(net_cat, net_spot, 1);
+ netlbl_bitmap_setbit(net_cat, net_spot, 1);
if (net_spot > net_spot_max)
net_spot_max = net_spot;
@@ -951,10 +881,10 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
}
for (;;) {
- net_spot = cipso_v4_bitmap_walk(net_cat,
- net_clen_bits,
- net_spot + 1,
- 1);
+ net_spot = netlbl_bitmap_walk(net_cat,
+ net_clen_bits,
+ net_spot + 1,
+ 1);
if (net_spot < 0) {
if (net_spot == -2)
return -EFAULT;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e333bc86b..062a67ca9 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1834,7 +1834,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
struct sk_buff *skb;
int err = -ENOBUFS;
- skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL);
if (!skb)
goto errout;
@@ -1846,7 +1846,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
kfree_skb(skb);
goto errout;
}
- rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL);
return;
errout:
if (err < 0)
@@ -1903,7 +1903,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
}
err = -ENOBUFS;
- skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
if (!skb)
goto errout;
@@ -2027,16 +2027,16 @@ static void inet_forward_change(struct net *net)
for_each_netdev(net, dev) {
struct in_device *in_dev;
+
if (on)
dev_disable_lro(dev);
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev);
+
+ in_dev = __in_dev_get_rtnl(dev);
if (in_dev) {
IN_DEV_CONF_SET(in_dev, FORWARDING, on);
inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
dev->ifindex, &in_dev->cnf);
}
- rcu_read_unlock();
}
}
@@ -2232,7 +2232,7 @@ static struct devinet_sysctl_table {
};
static int __devinet_sysctl_register(struct net *net, char *dev_name,
- struct ipv4_devconf *p)
+ int ifindex, struct ipv4_devconf *p)
{
int i;
struct devinet_sysctl_table *t;
@@ -2255,6 +2255,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
goto free;
p->sysctl = t;
+
+ inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p);
return 0;
free:
@@ -2286,7 +2288,7 @@ static int devinet_sysctl_register(struct in_device *idev)
if (err)
return err;
err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
- &idev->cnf);
+ idev->dev->ifindex, &idev->cnf);
if (err)
neigh_sysctl_unregister(idev->arp_parms);
return err;
@@ -2347,11 +2349,12 @@ static __net_init int devinet_init_net(struct net *net)
}
#ifdef CONFIG_SYSCTL
- err = __devinet_sysctl_register(net, "all", all);
+ err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all);
if (err < 0)
goto err_reg_all;
- err = __devinet_sysctl_register(net, "default", dflt);
+ err = __devinet_sysctl_register(net, "default",
+ NETCONFA_IFINDEX_DEFAULT, dflt);
if (err < 0)
goto err_reg_dflt;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ef2ebeb89..1b25daf8c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -509,6 +509,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
if (!dev)
return -ENODEV;
cfg->fc_oif = dev->ifindex;
+ cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
struct in_ifaddr *ifa;
struct in_device *in_dev = __in_dev_get_rtnl(dev);
@@ -1027,7 +1028,7 @@ no_promotions:
* First of all, we scan fib_info list searching
* for stray nexthop entries, then ignite fib_flush.
*/
- if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+ if (fib_sync_down_addr(dev, ifa->ifa_local))
fib_flush(dev_net(dev));
}
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index f2bda9e89..6e9ea69e5 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -76,6 +76,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
{
int err = -EAGAIN;
struct fib_table *tbl;
+ u32 tb_id;
switch (rule->action) {
case FR_ACT_TO_TBL:
@@ -94,7 +95,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
rcu_read_lock();
- tbl = fib_get_table(rule->fr_net, rule->table);
+ tb_id = fib_rule_get_table(rule, arg);
+ tbl = fib_get_table(rule->fr_net, tb_id);
if (tbl)
err = fib_table_lookup(tbl, &flp->u.ip4,
(struct fib_result *)arg->result,
@@ -180,7 +182,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (err)
goto errout;
- if (rule->table == RT_TABLE_UNSPEC) {
+ if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) {
if (rule->action == FR_ACT_TO_TBL) {
struct fib_table *table;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 539fa264e..e9f56225e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi->fib_priority = cfg->fc_priority;
fi->fib_prefsrc = cfg->fc_prefsrc;
fi->fib_type = cfg->fc_type;
+ fi->fib_tb_id = cfg->fc_table;
fi->fib_nhs = nhs;
change_nexthops(fi) {
@@ -1337,18 +1338,21 @@ nla_put_failure:
* referring to it.
* - device went down -> we must shutdown all nexthops going via it.
*/
-int fib_sync_down_addr(struct net *net, __be32 local)
+int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
int ret = 0;
unsigned int hash = fib_laddr_hashfn(local);
struct hlist_head *head = &fib_info_laddrhash[hash];
+ struct net *net = dev_net(dev);
+ int tb_id = l3mdev_fib_table(dev);
struct fib_info *fi;
if (!fib_info_laddrhash || local == 0)
return 0;
hlist_for_each_entry(fi, head, fib_lhash) {
- if (!net_eq(fi->fib_net, net))
+ if (!net_eq(fi->fib_net, net) ||
+ fi->fib_tb_id != tb_id)
continue;
if (fi->fib_prefsrc == local) {
fi->fib_flags |= RTNH_F_DEAD;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index febca0f10..e2ffc2a5c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -249,7 +249,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv)
* index into the parent's child array. That is, they will be used to find
* 'n' among tp's children.
*
- * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
+ * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
* for the node n.
*
* All the bits we have seen so far are significant to the node n. The rest
@@ -258,7 +258,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv)
* The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
* n's child array, and will of course be different for each child.
*
- * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
+ * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
* at this point.
*/
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 5f9207c03..321d57f82 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -129,6 +129,36 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+
+ case 1: {
+ /* Direct encasulation of IPv4 or IPv6 */
+
+ int prot;
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ prot = IPPROTO_IPIP;
+ break;
+ case 6:
+ prot = IPPROTO_IPV6;
+ break;
+ default:
+ goto drop;
+ }
+
+ if (fou_recv_pull(skb, fou, sizeof(struct udphdr)))
+ goto drop;
+
+ return -prot;
+ }
+
+ default: /* Undefined version */
+ goto drop;
+ }
+
optlen = guehdr->hlen << 2;
len += optlen;
@@ -289,6 +319,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk,
int flush = 1;
struct fou *fou = fou_from_sock(sk);
struct gro_remcsum grc;
+ u8 proto;
skb_gro_remcsum_init(&grc);
@@ -302,6 +333,25 @@ static struct sk_buff **gue_gro_receive(struct sock *sk,
goto out;
}
+ switch (guehdr->version) {
+ case 0:
+ break;
+ case 1:
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ proto = IPPROTO_IPIP;
+ break;
+ case 6:
+ proto = IPPROTO_IPV6;
+ break;
+ default:
+ goto out;
+ }
+ goto next_proto;
+ default:
+ goto out;
+ }
+
optlen = guehdr->hlen << 2;
len += optlen;
@@ -370,6 +420,10 @@ static struct sk_buff **gue_gro_receive(struct sock *sk,
}
}
+ proto = guehdr->proto_ctype;
+
+next_proto:
+
/* We can clear the encap_mark for GUE as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
* header to the outer L3 tunnel header, or we are are simply
@@ -383,7 +437,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk,
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[guehdr->proto_ctype]);
+ ops = rcu_dereference(offloads[proto]);
if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive))
goto out_unlock;
@@ -404,13 +458,30 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
const struct net_offload **offloads;
struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
const struct net_offload *ops;
- unsigned int guehlen;
+ unsigned int guehlen = 0;
u8 proto;
int err = -ENOENT;
- proto = guehdr->proto_ctype;
-
- guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+ switch (guehdr->version) {
+ case 0:
+ proto = guehdr->proto_ctype;
+ guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+ break;
+ case 1:
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ proto = IPPROTO_IPIP;
+ break;
+ case 6:
+ proto = IPPROTO_IPV6;
+ break;
+ default:
+ return err;
+ }
+ break;
+ default:
+ return err;
+ }
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index de1d119a4..b798862b6 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -117,6 +117,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if ((*(u8 *)options & 0xF0) != 0x40)
hdr_len += 4;
}
+ tpi->hdr_len = hdr_len;
return hdr_len;
}
EXPORT_SYMBOL(gre_parse_header);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fa8c39804..61a9deec2 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -603,7 +603,7 @@ static void reqsk_timer_handler(unsigned long data)
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
- mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
+ mod_timer(&req->rsk_timer, jiffies + timeo);
return;
}
drop:
@@ -617,8 +617,9 @@ static void reqsk_queue_hash_req(struct request_sock *req,
req->num_timeout = 0;
req->sk = NULL;
- setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
- mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
+ setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler,
+ (unsigned long)req);
+ mod_timer(&req->rsk_timer, jiffies + timeout);
inet_ehash_insert(req_to_sk(req), NULL);
/* before letting lookups find us, make sure all req fields
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 25af12436..38c2c47fe 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -44,6 +44,7 @@ struct inet_diag_entry {
u16 dport;
u16 family;
u16 userlocks;
+ u32 ifindex;
};
static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -571,6 +572,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
yes = 0;
break;
}
+ case INET_DIAG_BC_DEV_COND: {
+ u32 ifindex;
+
+ ifindex = *((const u32 *)(op + 1));
+ if (ifindex != entry->ifindex)
+ yes = 0;
+ break;
+ }
}
if (yes) {
@@ -613,6 +622,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
entry_fill_addrs(&entry, sk);
entry.sport = inet->inet_num;
entry.dport = ntohs(inet->inet_dport);
+ entry.ifindex = sk->sk_bound_dev_if;
entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
return inet_diag_bc_run(bc, &entry);
@@ -636,6 +646,17 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
}
+/* data is u32 ifindex */
+static bool valid_devcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ /* Check ifindex space. */
+ *min_len += sizeof(u32);
+ if (len < *min_len)
+ return false;
+
+ return true;
+}
/* Validate an inet_diag_hostcond. */
static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
int *min_len)
@@ -700,6 +721,10 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
if (!valid_hostcond(bc, len, &min_len))
return -EINVAL;
break;
+ case INET_DIAG_BC_DEV_COND:
+ if (!valid_devcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
case INET_DIAG_BC_D_GE:
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 3a88b0c73..b5e9317ea 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -355,7 +355,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
{
struct inet_frag_queue *q;
- if (frag_mem_limit(nf) > nf->high_thresh) {
+ if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
inet_frag_schedule_worker(f);
return NULL;
}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 206581674..ddcd56c08 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -188,7 +188,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
- setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
+ setup_pinned_timer(&tw->tw_timer, tw_timer_handler,
+ (unsigned long)tw);
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this
@@ -248,7 +249,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
tw->tw_kill = timeo <= 4*HZ;
if (!rearm) {
- BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo));
+ BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
atomic_inc(&tw->tw_dr->tw_count);
} else {
mod_timer_pending(&tw->tw_timer, jiffies + timeo);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index cbfb1808f..8b4ffd216 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -54,7 +54,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
if (skb->ignore_df)
return false;
- if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
return false;
return true;
@@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb)
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
- IPCB(skb)->flags |= IPSKB_FORWARDED;
+ IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
if (ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1d000af7f..113cc43df 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -138,6 +138,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
const struct iphdr *iph;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ unsigned int data_len = 0;
struct ip_tunnel *t;
switch (type) {
@@ -163,6 +164,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
return;
+ data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
case ICMP_REDIRECT:
@@ -181,6 +183,13 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
if (!t)
return;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tpi->proto == htons(ETH_P_IPV6) &&
+ !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
+ type, data_len))
+ return;
+#endif
+
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
return;
@@ -361,7 +370,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel->parms.o_flags, proto, tunnel->parms.o_key,
htonl(tunnel->o_seqno));
- skb_set_inner_protocol(skb, proto);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
@@ -837,17 +845,19 @@ out:
return ipgre_tunnel_validate(tb, data);
}
-static void ipgre_netlink_parms(struct net_device *dev,
+static int ipgre_netlink_parms(struct net_device *dev,
struct nlattr *data[],
struct nlattr *tb[],
struct ip_tunnel_parm *parms)
{
+ struct ip_tunnel *t = netdev_priv(dev);
+
memset(parms, 0, sizeof(*parms));
parms->iph.protocol = IPPROTO_GRE;
if (!data)
- return;
+ return 0;
if (data[IFLA_GRE_LINK])
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
@@ -876,16 +886,26 @@ static void ipgre_netlink_parms(struct net_device *dev,
if (data[IFLA_GRE_TOS])
parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
- if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
+ if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
+ if (t->ignore_df)
+ return -EINVAL;
parms->iph.frag_off = htons(IP_DF);
+ }
if (data[IFLA_GRE_COLLECT_METADATA]) {
- struct ip_tunnel *t = netdev_priv(dev);
-
t->collect_md = true;
if (dev->type == ARPHRD_IPGRE)
dev->type = ARPHRD_NONE;
}
+
+ if (data[IFLA_GRE_IGNORE_DF]) {
+ if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
+ && (parms->iph.frag_off & htons(IP_DF)))
+ return -EINVAL;
+ t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
+ }
+
+ return 0;
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -956,16 +976,19 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
- int err = ip_tunnel_encap_setup(t, &ipencap);
+ err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
- ipgre_netlink_parms(dev, data, tb, &p);
+ err = ipgre_netlink_parms(dev, data, tb, &p);
+ if (err < 0)
+ return err;
return ip_tunnel_newlink(dev, tb, &p);
}
@@ -974,16 +997,19 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
- int err = ip_tunnel_encap_setup(t, &ipencap);
+ err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
- ipgre_netlink_parms(dev, data, tb, &p);
+ err = ipgre_netlink_parms(dev, data, tb, &p);
+ if (err < 0)
+ return err;
return ip_tunnel_changelink(dev, tb, &p);
}
@@ -1020,6 +1046,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_GRE_COLLECT_METADATA */
nla_total_size(0) +
+ /* IFLA_GRE_IGNORE_DF */
+ nla_total_size(1) +
0;
}
@@ -1053,6 +1081,9 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
t->encap.flags))
goto nla_put_failure;
+ if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
+ goto nla_put_failure;
+
if (t->collect_md) {
if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
goto nla_put_failure;
@@ -1080,6 +1111,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
[IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
+ [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4b351af3e..d6feabb03 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ struct net_device *dev = skb->dev;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev);
+ iph->tos, dev);
if (unlikely(err)) {
if (err == -EXDEV)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
@@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
/* RFC 1122 3.3.6:
*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4bd492163..dde37fb34 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -223,9 +223,11 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
struct sk_buff *segs;
int ret = 0;
- /* common case: locally created skb or seglen is <= mtu */
- if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
- skb_gso_network_seglen(skb) <= mtu)
+ /* common case: fragmentation of segments is not allowed,
+ * or seglen is <= mtu
+ */
+ if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) ||
+ skb_gso_validate_mtu(skb, mtu))
return ip_finish_output2(net, sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d8f5e0a26..95649ebd2 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -682,7 +682,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
df = tnl_params->frag_off;
- if (skb->protocol == htons(ETH_P_IP))
+ if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
df |= (inner_iph->frag_off&htons(IP_DF));
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index afd6b5968..0f227db0e 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -63,6 +63,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
int pkt_len = skb->len - skb_inner_network_offset(skb);
struct net *net = dev_net(rt->dst.dev);
struct net_device *dev = skb->dev;
+ int skb_iif = skb->skb_iif;
struct iphdr *iph;
int err;
@@ -72,6 +73,16 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
skb_dst_set(skb, &rt->dst);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ if (skb_iif && !(df & htons(IP_DF))) {
+ /* Arrived from an ingress interface, got encapsulated, with
+ * fragmentation of encapulating frames allowed.
+ * If skb is gso, the resulting encapsulated network segments
+ * may exceed dst mtu.
+ * Allow IP Fragmentation of segments.
+ */
+ IPCB(skb)->flags |= IPSKB_FRAG_SEGS;
+ }
+
/* Push down and install the IP header. */
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index cc701fa70..5d7944f39 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -88,6 +88,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
struct net_device *dev;
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
+ struct xfrm_mode *inner_mode;
struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
u32 orig_mark = skb->mark;
int ret;
@@ -105,7 +106,19 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
}
x = xfrm_input_state(skb);
- family = x->inner_mode->afinfo->family;
+
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ family = inner_mode->afinfo->family;
skb->mark = be32_to_cpu(tunnel->parms.i_key);
ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 978370132..4ae3f8e6c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -148,14 +148,14 @@ static int ipip_err(struct sk_buff *skb, u32 info)
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- t->parms.link, 0, IPPROTO_IPIP, 0);
+ t->parms.link, 0, iph->protocol, 0);
err = 0;
goto out;
}
if (type == ICMP_REDIRECT) {
ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
- IPPROTO_IPIP, 0);
+ iph->protocol, 0);
err = 0;
goto out;
}
@@ -177,12 +177,19 @@ out:
return err;
}
-static const struct tnl_ptk_info tpi = {
+static const struct tnl_ptk_info ipip_tpi = {
/* no tunnel info required for ipip. */
.proto = htons(ETH_P_IP),
};
-static int ipip_rcv(struct sk_buff *skb)
+#if IS_ENABLED(CONFIG_MPLS)
+static const struct tnl_ptk_info mplsip_tpi = {
+ /* no tunnel info required for mplsip. */
+ .proto = htons(ETH_P_MPLS_UC),
+};
+#endif
+
+static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
@@ -193,11 +200,23 @@ static int ipip_rcv(struct sk_buff *skb)
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
iph->saddr, iph->daddr, 0);
if (tunnel) {
+ const struct tnl_ptk_info *tpi;
+
+ if (tunnel->parms.iph.protocol != ipproto &&
+ tunnel->parms.iph.protocol != 0)
+ goto drop;
+
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- if (iptunnel_pull_header(skb, 0, tpi.proto, false))
+#if IS_ENABLED(CONFIG_MPLS)
+ if (ipproto == IPPROTO_MPLS)
+ tpi = &mplsip_tpi;
+ else
+#endif
+ tpi = &ipip_tpi;
+ if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
+ return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
}
return -1;
@@ -207,24 +226,51 @@ drop:
return 0;
}
+static int ipip_rcv(struct sk_buff *skb)
+{
+ return ipip_tunnel_rcv(skb, IPPROTO_IPIP);
+}
+
+#if IS_ENABLED(CONFIG_MPLS)
+static int mplsip_rcv(struct sk_buff *skb)
+{
+ return ipip_tunnel_rcv(skb, IPPROTO_MPLS);
+}
+#endif
+
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
*/
-static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tiph = &tunnel->parms.iph;
+ u8 ipproto;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ipproto = IPPROTO_IPIP;
+ break;
+#if IS_ENABLED(CONFIG_MPLS)
+ case htons(ETH_P_MPLS_UC):
+ ipproto = IPPROTO_MPLS;
+ break;
+#endif
+ default:
+ goto tx_error;
+ }
- if (unlikely(skb->protocol != htons(ETH_P_IP)))
+ if (tiph->protocol != ipproto && tiph->protocol != 0)
goto tx_error;
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
goto tx_error;
- skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+ skb_set_inner_ipproto(skb, ipproto);
- ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
+ ip_tunnel_xmit(skb, dev, tiph, ipproto);
return NETDEV_TX_OK;
tx_error:
@@ -234,6 +280,20 @@ tx_error:
return NETDEV_TX_OK;
}
+static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto)
+{
+ switch (ipproto) {
+ case 0:
+ case IPPROTO_IPIP:
+#if IS_ENABLED(CONFIG_MPLS)
+ case IPPROTO_MPLS:
+#endif
+ return true;
+ }
+
+ return false;
+}
+
static int
ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
@@ -244,7 +304,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return -EFAULT;
if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ if (p.iph.version != 4 ||
+ !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) ||
p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
return -EINVAL;
}
@@ -301,10 +362,23 @@ static int ipip_tunnel_init(struct net_device *dev)
tunnel->tun_hlen = 0;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
- tunnel->parms.iph.protocol = IPPROTO_IPIP;
return ip_tunnel_init(dev);
}
+static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ u8 proto;
+
+ if (!data || !data[IFLA_IPTUN_PROTO])
+ return 0;
+
+ proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+ if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
static void ipip_netlink_parms(struct nlattr *data[],
struct ip_tunnel_parm *parms)
{
@@ -335,6 +409,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
if (data[IFLA_IPTUN_TOS])
parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+ if (data[IFLA_IPTUN_PROTO])
+ parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+
if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF);
}
@@ -427,6 +504,8 @@ static size_t ipip_get_size(const struct net_device *dev)
nla_total_size(1) +
/* IFLA_IPTUN_TOS */
nla_total_size(1) +
+ /* IFLA_IPTUN_PROTO */
+ nla_total_size(1) +
/* IFLA_IPTUN_PMTUDISC */
nla_total_size(1) +
/* IFLA_IPTUN_ENCAP_TYPE */
@@ -450,6 +529,7 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
+ nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
!!(parm->iph.frag_off & htons(IP_DF))))
goto nla_put_failure;
@@ -476,6 +556,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
[IFLA_IPTUN_TTL] = { .type = NLA_U8 },
[IFLA_IPTUN_TOS] = { .type = NLA_U8 },
+ [IFLA_IPTUN_PROTO] = { .type = NLA_U8 },
[IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
[IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
@@ -489,6 +570,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = {
.policy = ipip_policy,
.priv_size = sizeof(struct ip_tunnel),
.setup = ipip_tunnel_setup,
+ .validate = ipip_tunnel_validate,
.newlink = ipip_newlink,
.changelink = ipip_changelink,
.dellink = ip_tunnel_dellink,
@@ -503,6 +585,14 @@ static struct xfrm_tunnel ipip_handler __read_mostly = {
.priority = 1,
};
+#if IS_ENABLED(CONFIG_MPLS)
+static struct xfrm_tunnel mplsip_handler __read_mostly = {
+ .handler = mplsip_rcv,
+ .err_handler = ipip_err,
+ .priority = 1,
+};
+#endif
+
static int __net_init ipip_init_net(struct net *net)
{
return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
@@ -525,7 +615,7 @@ static int __init ipip_init(void)
{
int err;
- pr_info("ipip: IPv4 over IPv4 tunneling driver\n");
+ pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n");
err = register_pernet_device(&ipip_net_ops);
if (err < 0)
@@ -533,8 +623,15 @@ static int __init ipip_init(void)
err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
if (err < 0) {
pr_info("%s: can't register tunnel\n", __func__);
- goto xfrm_tunnel_failed;
+ goto xfrm_tunnel_ipip_failed;
+ }
+#if IS_ENABLED(CONFIG_MPLS)
+ err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS);
+ if (err < 0) {
+ pr_info("%s: can't register tunnel\n", __func__);
+ goto xfrm_tunnel_mplsip_failed;
}
+#endif
err = rtnl_link_register(&ipip_link_ops);
if (err < 0)
goto rtnl_link_failed;
@@ -543,8 +640,13 @@ out:
return err;
rtnl_link_failed:
+#if IS_ENABLED(CONFIG_MPLS)
+ xfrm4_tunnel_deregister(&mplsip_handler, AF_INET);
+xfrm_tunnel_mplsip_failed:
+
+#endif
xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
-xfrm_tunnel_failed:
+xfrm_tunnel_ipip_failed:
unregister_pernet_device(&ipip_net_ops);
goto out;
}
@@ -554,7 +656,10 @@ static void __exit ipip_fini(void)
rtnl_link_unregister(&ipip_link_ops);
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
pr_info("%s: can't deregister tunnel\n", __func__);
-
+#if IS_ENABLED(CONFIG_MPLS)
+ if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS))
+ pr_info("%s: can't deregister tunnel\n", __func__);
+#endif
unregister_pernet_device(&ipip_net_ops);
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5ad48ec77..5f006e13d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -722,6 +722,7 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache
cache->mfc_un.res.maxvif = vifi + 1;
}
}
+ cache->mfc_un.res.lastuse = jiffies;
}
static int vif_add(struct net *net, struct mr_table *mrt,
@@ -1748,7 +1749,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
vif->dev->stats.tx_bytes += skb->len;
}
- IPCB(skb)->flags |= IPSKB_FORWARDED;
+ IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS;
/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
* not only before forwarding, but after forwarding on all output
@@ -1792,6 +1793,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
vif = cache->mfc_parent;
cache->mfc_un.res.pkt++;
cache->mfc_un.res.bytes += skb->len;
+ cache->mfc_un.res.lastuse = jiffies;
if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
struct mfc_cache *cache_proxy;
@@ -2071,10 +2073,11 @@ drop:
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mfc_cache *c, struct rtmsg *rtm)
{
- int ct;
- struct rtnexthop *nhp;
- struct nlattr *mp_attr;
struct rta_mfc_stats mfcs;
+ struct nlattr *mp_attr;
+ struct rtnexthop *nhp;
+ unsigned long lastuse;
+ int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
if (c->mfc_parent >= MAXVIFS)
@@ -2103,10 +2106,15 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
nla_nest_end(skb, mp_attr);
+ lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+ lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
- if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0)
+ if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
+ nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
+ RTA_PAD))
return -EMSGSIZE;
rtm->rtm_type = RTN_MULTICAST;
@@ -2115,7 +2123,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int ipmr_get_route(struct net *net, struct sk_buff *skb,
__be32 saddr, __be32 daddr,
- struct rtmsg *rtm, int nowait)
+ struct rtmsg *rtm, int nowait, u32 portid)
{
struct mfc_cache *cache;
struct mr_table *mrt;
@@ -2160,6 +2168,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
return -ENOMEM;
}
+ NETLINK_CB(skb2).portid = portid;
skb_push(skb2, sizeof(struct iphdr));
skb_reset_network_header(skb2);
iph = ip_hdr(skb2);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 2033f929a..b31df597f 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -89,22 +89,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
__be32 src_ipaddr, tgt_ipaddr;
long ret;
-#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
-
- if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
- ARPT_INV_ARPOP))
+ if (NF_INVF(arpinfo, ARPT_INV_ARPOP,
+ (arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop))
return 0;
- if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
- ARPT_INV_ARPHRD))
+ if (NF_INVF(arpinfo, ARPT_INV_ARPHRD,
+ (arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd))
return 0;
- if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
- ARPT_INV_ARPPRO))
+ if (NF_INVF(arpinfo, ARPT_INV_ARPPRO,
+ (arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro))
return 0;
- if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
- ARPT_INV_ARPHLN))
+ if (NF_INVF(arpinfo, ARPT_INV_ARPHLN,
+ (arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln))
return 0;
src_devaddr = arpptr;
@@ -115,31 +113,32 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
arpptr += dev->addr_len;
memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
- if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
- ARPT_INV_SRCDEVADDR) ||
- FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
- ARPT_INV_TGTDEVADDR))
+ if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR,
+ arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr,
+ dev->addr_len)) ||
+ NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR,
+ arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr,
+ dev->addr_len)))
return 0;
- if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
- ARPT_INV_SRCIP) ||
- FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
- ARPT_INV_TGTIP))
+ if (NF_INVF(arpinfo, ARPT_INV_SRCIP,
+ (src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr) ||
+ NF_INVF(arpinfo, ARPT_INV_TGTIP,
+ (tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr))
return 0;
/* Look for ifname matches. */
ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask);
- if (FWINV(ret != 0, ARPT_INV_VIA_IN))
+ if (NF_INVF(arpinfo, ARPT_INV_VIA_IN, ret != 0))
return 0;
ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask);
- if (FWINV(ret != 0, ARPT_INV_VIA_OUT))
+ if (NF_INVF(arpinfo, ARPT_INV_VIA_OUT, ret != 0))
return 0;
return 1;
-#undef FWINV
}
static inline int arp_checkentry(const struct arpt_arp *arp)
@@ -300,23 +299,12 @@ static inline bool unconditional(const struct arpt_entry *e)
memcmp(&e->arp, &uncond, sizeof(uncond)) == 0;
}
-static bool find_jump_target(const struct xt_table_info *t,
- const struct arpt_entry *target)
-{
- struct arpt_entry *iter;
-
- xt_entry_foreach(iter, t->entries, t->size) {
- if (iter == target)
- return true;
- }
- return false;
-}
-
/* Figures out from what hook each rule can be called: returns 0 if
* there are loops. Puts hook bitmask in comefrom.
*/
static int mark_source_chains(const struct xt_table_info *newinfo,
- unsigned int valid_hooks, void *entry0)
+ unsigned int valid_hooks, void *entry0,
+ unsigned int *offsets)
{
unsigned int hook;
@@ -389,10 +377,11 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
/* This a jump; chase it. */
+ if (!xt_find_jump_offset(offsets, newpos,
+ newinfo->number))
+ return 0;
e = (struct arpt_entry *)
(entry0 + newpos);
- if (!find_jump_target(newinfo, e))
- return 0;
} else {
/* ... this is a fallthru */
newpos = pos + e->next_offset;
@@ -544,6 +533,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
const struct arpt_replace *repl)
{
struct arpt_entry *iter;
+ unsigned int *offsets;
unsigned int i;
int ret = 0;
@@ -556,6 +546,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
newinfo->underflow[i] = 0xFFFFFFFF;
}
+ offsets = xt_alloc_entry_offsets(newinfo->number);
+ if (!offsets)
+ return -ENOMEM;
i = 0;
/* Walk through entries, checking offsets. */
@@ -566,17 +559,20 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
repl->underflow,
repl->valid_hooks);
if (ret != 0)
- break;
+ goto out_free;
+ if (i < repl->num_entries)
+ offsets[i] = (void *)iter - entry0;
++i;
if (strcmp(arpt_get_target(iter)->u.user.name,
XT_ERROR_TARGET) == 0)
++newinfo->stacksize;
}
if (ret != 0)
- return ret;
+ goto out_free;
+ ret = -EINVAL;
if (i != repl->num_entries)
- return -EINVAL;
+ goto out_free;
/* Check hooks all assigned */
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
@@ -584,13 +580,16 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
if (!(repl->valid_hooks & (1 << i)))
continue;
if (newinfo->hook_entry[i] == 0xFFFFFFFF)
- return -EINVAL;
+ goto out_free;
if (newinfo->underflow[i] == 0xFFFFFFFF)
- return -EINVAL;
+ goto out_free;
}
- if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
- return -ELOOP;
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
+ ret = -ELOOP;
+ goto out_free;
+ }
+ kvfree(offsets);
/* Finally, each sanity check must pass */
i = 0;
@@ -611,6 +610,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
}
return ret;
+ out_free:
+ kvfree(offsets);
+ return ret;
}
static void get_counters(const struct xt_table_info *t,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 54906e0e8..f993545a3 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -58,32 +58,31 @@ ip_packet_match(const struct iphdr *ip,
{
unsigned long ret;
-#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
-
- if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
- IPT_INV_SRCIP) ||
- FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
- IPT_INV_DSTIP))
+ if (NF_INVF(ipinfo, IPT_INV_SRCIP,
+ (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) ||
+ NF_INVF(ipinfo, IPT_INV_DSTIP,
+ (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr))
return false;
ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);
- if (FWINV(ret != 0, IPT_INV_VIA_IN))
+ if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0))
return false;
ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);
- if (FWINV(ret != 0, IPT_INV_VIA_OUT))
+ if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0))
return false;
/* Check specific protocol */
if (ipinfo->proto &&
- FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO))
+ NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto))
return false;
/* If we have a fragment rule but the packet is not a fragment
* then we return zero */
- if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG))
+ if (NF_INVF(ipinfo, IPT_INV_FRAG,
+ (ipinfo->flags & IPT_F_FRAG) && !isfrag))
return false;
return true;
@@ -122,7 +121,6 @@ static inline bool unconditional(const struct ipt_entry *e)
return e->target_offset == sizeof(struct ipt_entry) &&
memcmp(&e->ip, &uncond, sizeof(uncond)) == 0;
-#undef FWINV
}
/* for const-correctness */
@@ -375,23 +373,12 @@ ipt_do_table(struct sk_buff *skb,
else return verdict;
}
-static bool find_jump_target(const struct xt_table_info *t,
- const struct ipt_entry *target)
-{
- struct ipt_entry *iter;
-
- xt_entry_foreach(iter, t->entries, t->size) {
- if (iter == target)
- return true;
- }
- return false;
-}
-
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
mark_source_chains(const struct xt_table_info *newinfo,
- unsigned int valid_hooks, void *entry0)
+ unsigned int valid_hooks, void *entry0,
+ unsigned int *offsets)
{
unsigned int hook;
@@ -460,10 +447,11 @@ mark_source_chains(const struct xt_table_info *newinfo,
XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
/* This a jump; chase it. */
+ if (!xt_find_jump_offset(offsets, newpos,
+ newinfo->number))
+ return 0;
e = (struct ipt_entry *)
(entry0 + newpos);
- if (!find_jump_target(newinfo, e))
- return 0;
} else {
/* ... this is a fallthru */
newpos = pos + e->next_offset;
@@ -696,6 +684,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ipt_replace *repl)
{
struct ipt_entry *iter;
+ unsigned int *offsets;
unsigned int i;
int ret = 0;
@@ -708,6 +697,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
newinfo->underflow[i] = 0xFFFFFFFF;
}
+ offsets = xt_alloc_entry_offsets(newinfo->number);
+ if (!offsets)
+ return -ENOMEM;
i = 0;
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter, entry0, newinfo->size) {
@@ -717,15 +709,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
repl->underflow,
repl->valid_hooks);
if (ret != 0)
- return ret;
+ goto out_free;
+ if (i < repl->num_entries)
+ offsets[i] = (void *)iter - entry0;
++i;
if (strcmp(ipt_get_target(iter)->u.user.name,
XT_ERROR_TARGET) == 0)
++newinfo->stacksize;
}
+ ret = -EINVAL;
if (i != repl->num_entries)
- return -EINVAL;
+ goto out_free;
/* Check hooks all assigned */
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
@@ -733,13 +728,16 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
if (!(repl->valid_hooks & (1 << i)))
continue;
if (newinfo->hook_entry[i] == 0xFFFFFFFF)
- return -EINVAL;
+ goto out_free;
if (newinfo->underflow[i] == 0xFFFFFFFF)
- return -EINVAL;
+ goto out_free;
}
- if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
- return -ELOOP;
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
+ ret = -ELOOP;
+ goto out_free;
+ }
+ kvfree(offsets);
/* Finally, each sanity check must pass */
i = 0;
@@ -760,6 +758,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
}
return ret;
+ out_free:
+ kvfree(offsets);
+ return ret;
}
static void
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 57fc97cda..aebdb337f 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -87,10 +87,6 @@ iptable_mangle_hook(void *priv,
{
if (state->hook == NF_INET_LOCAL_OUT)
return ipt_mangle_out(skb, state);
- if (state->hook == NF_INET_POST_ROUTING)
- return ipt_do_table(skb, state,
- state->net->ipv4.iptable_mangle);
- /* PREROUTING/INPUT/FORWARD: */
return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index c6f3c406f..63923710f 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -26,6 +26,8 @@
struct ct_iter_state {
struct seq_net_private p;
+ struct hlist_nulls_head *hash;
+ unsigned int htable_size;
unsigned int bucket;
};
@@ -35,10 +37,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
struct hlist_nulls_node *n;
for (st->bucket = 0;
- st->bucket < nf_conntrack_htable_size;
+ st->bucket < st->htable_size;
st->bucket++) {
n = rcu_dereference(
- hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
+ hlist_nulls_first_rcu(&st->hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -53,11 +55,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= nf_conntrack_htable_size)
+ if (++st->bucket >= st->htable_size)
return NULL;
}
head = rcu_dereference(
- hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
+ hlist_nulls_first_rcu(&st->hash[st->bucket]));
}
return head;
}
@@ -75,7 +77,11 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
+ struct ct_iter_state *st = seq->private;
+
rcu_read_lock();
+
+ nf_conntrack_get_ht(&st->hash, &st->htable_size);
return ct_get_idx(seq, *pos);
}
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index b6ea57ec5..fd8220213 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -24,6 +24,9 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
return NULL;
+ if (ip_hdr(oldskb)->protocol != IPPROTO_TCP)
+ return NULL;
+
oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
sizeof(struct tcphdr), _oth);
if (oth == NULL)
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 2375b0a8b..30493beb6 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv,
__be32 saddr, daddr;
u_int8_t tos;
const struct iphdr *iph;
+ int err;
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
@@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv,
tos = iph->tos;
ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_QUEUE) {
+ if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
if (iph->saddr != saddr ||
iph->daddr != daddr ||
skb->mark != mark ||
- iph->tos != tos)
- if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
- ret = NF_DROP;
+ iph->tos != tos) {
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
}
return ret;
}
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index c24f41c81..2c2553b90 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -46,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = {
.eval = nft_reject_ipv4_eval,
.init = nft_reject_init,
.dump = nft_reject_dump,
+ .validate = nft_reject_validate,
};
static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a1f2830d8..62c3ed0b7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs)
atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
u32 old = ACCESS_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
- u32 delta = 0;
+ u32 new, delta = 0;
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
- return atomic_add_return(segs + delta, p_id) - segs;
+ /* Do not use atomic_add_return() as it makes UBSAN unhappy */
+ do {
+ old = (u32)atomic_read(p_id);
+ new = old + delta + segs;
+ } while (atomic_cmpxchg(p_id, old, new) != old);
+
+ return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
@@ -2497,7 +2503,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
int err = ipmr_get_route(net, skb,
fl4->saddr, fl4->daddr,
- r, nowait);
+ r, nowait, portid);
+
if (err <= 0) {
if (!nowait) {
if (err == 0)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 99fbc1479..2fa1a4a6d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2278,6 +2278,38 @@ static inline bool tcp_can_repair_sock(const struct sock *sk)
((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
}
+static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
+{
+ struct tcp_repair_window opt;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ if (len != sizeof(opt))
+ return -EINVAL;
+
+ if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ return -EFAULT;
+
+ if (opt.max_window < opt.snd_wnd)
+ return -EINVAL;
+
+ if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
+ return -EINVAL;
+
+ if (after(opt.rcv_wup, tp->rcv_nxt))
+ return -EINVAL;
+
+ tp->snd_wl1 = opt.snd_wl1;
+ tp->snd_wnd = opt.snd_wnd;
+ tp->max_window = opt.max_window;
+
+ tp->rcv_wnd = opt.rcv_wnd;
+ tp->rcv_wup = opt.rcv_wup;
+
+ return 0;
+}
+
static int tcp_repair_options_est(struct tcp_sock *tp,
struct tcp_repair_opt __user *optbuf, unsigned int len)
{
@@ -2708,6 +2740,9 @@ stealth_integrity_out_1:
else
tp->tsoffset = val - tcp_time_stamp;
break;
+ case TCP_REPAIR_WINDOW:
+ err = tcp_repair_set_window(tp, optval, optlen);
+ break;
case TCP_NOTSENT_LOWAT:
tp->notsent_lowat = val;
sk->sk_write_space(sk);
@@ -2976,6 +3011,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EINVAL;
break;
+ case TCP_REPAIR_WINDOW: {
+ struct tcp_repair_window opt;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len != sizeof(opt))
+ return -EINVAL;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ opt.snd_wl1 = tp->snd_wl1;
+ opt.snd_wnd = tp->snd_wnd;
+ opt.max_window = tp->max_window;
+ opt.rcv_wnd = tp->rcv_wnd;
+ opt.rcv_wup = tp->rcv_wup;
+
+ if (copy_to_user(optval, &opt, len))
+ return -EFAULT;
+ return 0;
+ }
case TCP_QUEUE_SEQ:
if (tp->repair_queue == TCP_SEND_QUEUE)
val = tp->write_seq;
@@ -3085,8 +3142,18 @@ static void __tcp_alloc_md5sig_pool(void)
return;
for_each_possible_cpu(cpu) {
+ void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
struct ahash_request *req;
+ if (!scratch) {
+ scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
+ sizeof(struct tcphdr),
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!scratch)
+ return;
+ per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
+ }
if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
continue;
@@ -3242,7 +3309,6 @@ int tcp_abort(struct sock *sk, int err)
local_bh_enable();
return 0;
}
- sock_gen_put(sk);
return -EOPNOTSUPP;
}
@@ -3271,7 +3337,6 @@ int tcp_abort(struct sock *sk, int err)
bh_unlock_sock(sk);
local_bh_enable();
release_sock(sk);
- sock_put(sk);
return 0;
}
EXPORT_SYMBOL_GPL(tcp_abort);
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 7e538f71f..10d728b68 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -293,7 +293,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
*/
if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- memset(info, 0, sizeof(struct tcp_dctcp_info));
+ memset(&info->dctcp, 0, sizeof(info->dctcp));
if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
info->dctcp.dctcp_enabled = 1;
info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
@@ -303,7 +303,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
}
*attr = INET_DIAG_DCTCPINFO;
- return sizeof(*info);
+ return sizeof(info->dctcp);
}
return 0;
}
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 4d610934f..a748c74aa 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -54,11 +54,16 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
{
struct net *net = sock_net(in_skb->sk);
struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req);
+ int err;
if (IS_ERR(sk))
return PTR_ERR(sk);
- return sock_diag_destroy(sk, ECONNABORTED);
+ err = sock_diag_destroy(sk, ECONNABORTED);
+
+ sock_gen_put(sk);
+
+ return err;
}
#endif
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 478114b36..4e777a324 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -227,6 +227,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tcp_fastopen_add_skb(child, skb);
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
+ tp->rcv_wup = tp->rcv_nxt;
/* tcp_conn_request() is sending the SYNACK,
* and queues the child into listener accept queue.
*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cd3c4cbbb..9d8e4f7d2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2332,10 +2332,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
- &np->daddr, ntohs(inet->inet_dport),
+ &sk->sk_v6_daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -3118,6 +3117,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
long ca_rtt_us = -1L;
struct sk_buff *skb;
u32 pkts_acked = 0;
+ u32 last_in_flight = 0;
bool rtt_update;
int flag = 0;
@@ -3157,6 +3157,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!first_ackt.v64)
first_ackt = last_ackt;
+ last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
reord = min(pkts_acked, reord);
if (!after(scb->end_seq, tp->high_seq))
flag |= FLAG_ORIG_SACK_ACKED;
@@ -3253,7 +3254,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (icsk->icsk_ca_ops->pkts_acked) {
struct ack_sample sample = { .pkts_acked = pkts_acked,
- .rtt_us = ca_rtt_us };
+ .rtt_us = ca_rtt_us,
+ .in_flight = last_in_flight };
icsk->icsk_ca_ops->pkts_acked(sk, &sample);
}
@@ -5247,6 +5249,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
struct tcp_sock *tp = tcp_sk(sk);
+ bool rst_seq_match = false;
/* RFC1323: H1. Apply PAWS check first. */
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
@@ -5283,13 +5286,32 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
/* Step 2: check RST bit */
if (th->rst) {
- /* RFC 5961 3.2 :
- * If sequence number exactly matches RCV.NXT, then
+ /* RFC 5961 3.2 (extend to match against SACK too if available):
+ * If seq num matches RCV.NXT or the right-most SACK block,
+ * then
* RESET the connection
* else
* Send a challenge ACK
*/
- if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ rst_seq_match = true;
+ } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int max_sack = sp[0].end_seq;
+ int this_sack;
+
+ for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
+ ++this_sack) {
+ max_sack = after(sp[this_sack].end_seq,
+ max_sack) ?
+ sp[this_sack].end_seq : max_sack;
+ }
+
+ if (TCP_SKB_CB(skb)->seq == max_sack)
+ rst_seq_match = true;
+ }
+
+ if (rst_seq_match)
tcp_reset(sk);
else
tcp_send_challenge_ack(sk, skb);
@@ -5949,7 +5971,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* so release it.
*/
if (req) {
- tp->total_retrans = req->num_retrans;
+ inet_csk(sk)->icsk_retransmits = 0;
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for correct metrics. */
@@ -6211,6 +6233,9 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
kmemcheck_annotate_bitfield(ireq, flags);
ireq->opt = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ ireq->pktopts = NULL;
+#endif
atomic64_set(&ireq->ir_cookie, 0);
ireq->ireq_state = TCP_NEW_SYN_RECV;
write_pnet(&ireq->ireq_net, sock_net(sk_listener));
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 70dbd19fd..ba2ab6fcf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1040,27 +1040,28 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
GFP_KERNEL);
}
-static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
- __be32 daddr, __be32 saddr, int nbytes)
+static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
+ __be32 daddr, __be32 saddr,
+ const struct tcphdr *th, int nbytes)
{
struct tcp4_pseudohdr *bp;
struct scatterlist sg;
+ struct tcphdr *_th;
- bp = &hp->md5_blk.ip4;
-
- /*
- * 1. the TCP pseudo-header (in the order: source IP address,
- * destination IP address, zero-padded protocol number, and
- * segment length)
- */
+ bp = hp->scratch;
bp->saddr = saddr;
bp->daddr = daddr;
bp->pad = 0;
bp->protocol = IPPROTO_TCP;
bp->len = cpu_to_be16(nbytes);
- sg_init_one(&sg, bp, sizeof(*bp));
- ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+ _th = (struct tcphdr *)(bp + 1);
+ memcpy(_th, th, sizeof(*th));
+ _th->check = 0;
+
+ sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+ sizeof(*bp) + sizeof(*th));
return crypto_ahash_update(hp->md5_req);
}
@@ -1077,9 +1078,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
if (crypto_ahash_init(req))
goto clear_hash;
- if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_header(hp, th))
+ if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
@@ -1123,9 +1122,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
if (crypto_ahash_init(req))
goto clear_hash;
- if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
- goto clear_hash;
- if (tcp_md5_hash_header(hp, th))
+ if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
goto clear_hash;
if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
goto clear_hash;
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
new file mode 100644
index 000000000..5de82a8d4
--- /dev/null
+++ b/net/ipv4/tcp_nv.c
@@ -0,0 +1,476 @@
+/*
+ * TCP NV: TCP with Congestion Avoidance
+ *
+ * TCP-NV is a successor of TCP-Vegas that has been developed to
+ * deal with the issues that occur in modern networks.
+ * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
+ * the ability to detect congestion before packet losses occur.
+ * When congestion (queue buildup) starts to occur, TCP-NV
+ * predicts what the cwnd size should be for the current
+ * throughput and it reduces the cwnd proportionally to
+ * the difference between the current cwnd and the predicted cwnd.
+ *
+ * NV is only recommeneded for traffic within a data center, and when
+ * all the flows are NV (at least those within the data center). This
+ * is due to the inherent unfairness between flows using losses to
+ * detect congestion (congestion control) and those that use queue
+ * buildup to detect congestion (congestion avoidance).
+ *
+ * Note: High NIC coalescence values may lower the performance of NV
+ * due to the increased noise in RTT values. In particular, we have
+ * seen issues with rx-frames values greater than 8.
+ *
+ * TODO:
+ * 1) Add mechanism to deal with reverse congestion.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+/* TCP NV parameters
+ *
+ * nv_pad Max number of queued packets allowed in network
+ * nv_pad_buffer Do not grow cwnd if this closed to nv_pad
+ * nv_reset_period How often (in) seconds)to reset min_rtt
+ * nv_min_cwnd Don't decrease cwnd below this if there are no losses
+ * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected
+ * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8
+ * nv_rtt_factor RTT averaging factor
+ * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur
+ * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd
+ * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd
+ * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping
+ * slow-start due to congestion
+ * nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion
+ * nv_rtt_min_cnt Wait these many RTTs before making congesion decision
+ * nv_cwnd_growth_rate_neg
+ * nv_cwnd_growth_rate_pos
+ * How quickly to double growth rate (not rate) of cwnd when not
+ * congested. One value (nv_cwnd_growth_rate_neg) for when
+ * rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos)
+ * otherwise.
+ */
+
+static int nv_pad __read_mostly = 10;
+static int nv_pad_buffer __read_mostly = 2;
+static int nv_reset_period __read_mostly = 5; /* in seconds */
+static int nv_min_cwnd __read_mostly = 2;
+static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */
+static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */
+static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */
+static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */
+static int nv_cwnd_growth_rate_neg __read_mostly = 8;
+static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */
+static int nv_dec_eval_min_calls __read_mostly = 60;
+static int nv_inc_eval_min_calls __read_mostly = 20;
+static int nv_ssthresh_eval_min_calls __read_mostly = 30;
+static int nv_stop_rtt_cnt __read_mostly = 10;
+static int nv_rtt_min_cnt __read_mostly = 2;
+
+module_param(nv_pad, int, 0644);
+MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network");
+module_param(nv_reset_period, int, 0644);
+MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
+module_param(nv_min_cwnd, int, 0644);
+MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
+ " without losses");
+
+/* TCP NV Parameters */
+struct tcpnv {
+ unsigned long nv_min_rtt_reset_jiffies; /* when to switch to
+ * nv_min_rtt_new */
+ s8 cwnd_growth_factor; /* Current cwnd growth factor,
+ * < 0 => less than 1 packet/RTT */
+ u8 available8;
+ u16 available16;
+ u32 loss_cwnd; /* cwnd at last loss */
+ u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */
+ nv_reset:1, /* whether to reset values */
+ nv_catchup:1; /* whether we are growing because
+ * of temporary cwnd decrease */
+ u8 nv_eval_call_cnt; /* call count since last eval */
+ u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is
+ * smaller than this. It may grow to handle
+ * TSO, LRO and interrupt coalescence because
+ * with these a small cwnd cannot saturate
+ * the link. Note that this is different from
+ * the file local nv_min_cwnd */
+ u8 nv_rtt_cnt; /* RTTs without making ca decision */;
+ u32 nv_last_rtt; /* last rtt */
+ u32 nv_min_rtt; /* active min rtt. Used to determine slope */
+ u32 nv_min_rtt_new; /* min rtt for future use */
+ u32 nv_rtt_max_rate; /* max rate seen during current RTT */
+ u32 nv_rtt_start_seq; /* current RTT ends when packet arrives
+ * acking beyond nv_rtt_start_seq */
+ u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is
+ * used to determine bytes acked since last
+ * call to bictcp_acked */
+ u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */
+};
+
+#define NV_INIT_RTT U32_MAX
+#define NV_MIN_CWND 4
+#define NV_MIN_CWND_GROW 2
+#define NV_TSO_CWND_BOUND 80
+
+static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->nv_reset = 0;
+ ca->loss_cwnd = 0;
+ ca->nv_no_cong_cnt = 0;
+ ca->nv_rtt_cnt = 0;
+ ca->nv_last_rtt = 0;
+ ca->nv_rtt_max_rate = 0;
+ ca->nv_rtt_start_seq = tp->snd_una;
+ ca->nv_eval_call_cnt = 0;
+ ca->nv_last_snd_una = tp->snd_una;
+}
+
+static void tcpnv_init(struct sock *sk)
+{
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ tcpnv_reset(ca, sk);
+
+ ca->nv_allow_cwnd_growth = 1;
+ ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ;
+ ca->nv_min_rtt = NV_INIT_RTT;
+ ca->nv_min_rtt_new = NV_INIT_RTT;
+ ca->nv_min_cwnd = NV_MIN_CWND;
+ ca->nv_catchup = 0;
+ ca->cwnd_growth_factor = 0;
+}
+
+static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcpnv *ca = inet_csk_ca(sk);
+ u32 cnt;
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* Only grow cwnd if NV has not detected congestion */
+ if (!ca->nv_allow_cwnd_growth)
+ return;
+
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+
+ if (ca->cwnd_growth_factor < 0) {
+ cnt = tp->snd_cwnd << -ca->cwnd_growth_factor;
+ tcp_cong_avoid_ai(tp, cnt, acked);
+ } else {
+ cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor);
+ tcp_cong_avoid_ai(tp, cnt, acked);
+ }
+}
+
+static u32 tcpnv_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ ca->loss_cwnd = tp->snd_cwnd;
+ return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
+}
+
+static u32 tcpnv_undo_cwnd(struct sock *sk)
+{
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
+static void tcpnv_state(struct sock *sk, u8 new_state)
+{
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Open && ca->nv_reset) {
+ tcpnv_reset(ca, sk);
+ } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR ||
+ new_state == TCP_CA_Recovery) {
+ ca->nv_reset = 1;
+ ca->nv_allow_cwnd_growth = 0;
+ if (new_state == TCP_CA_Loss) {
+ /* Reset cwnd growth factor to Reno value */
+ if (ca->cwnd_growth_factor > 0)
+ ca->cwnd_growth_factor = 0;
+ /* Decrease growth rate if allowed */
+ if (nv_cwnd_growth_rate_neg > 0 &&
+ ca->cwnd_growth_factor > -8)
+ ca->cwnd_growth_factor--;
+ }
+ }
+}
+
+/* Do congestion avoidance calculations for TCP-NV
+ */
+static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcpnv *ca = inet_csk_ca(sk);
+ unsigned long now = jiffies;
+ s64 rate64 = 0;
+ u32 rate, max_win, cwnd_by_slope;
+ u32 avg_rtt;
+ u32 bytes_acked = 0;
+
+ /* Some calls are for duplicates without timetamps */
+ if (sample->rtt_us < 0)
+ return;
+
+ /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */
+ if (icsk->icsk_ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_Disorder)
+ return;
+
+ /* Stop cwnd growth if we were in catch up mode */
+ if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) {
+ ca->nv_catchup = 0;
+ ca->nv_allow_cwnd_growth = 0;
+ }
+
+ bytes_acked = tp->snd_una - ca->nv_last_snd_una;
+ ca->nv_last_snd_una = tp->snd_una;
+
+ if (sample->in_flight == 0)
+ return;
+
+ /* Calculate moving average of RTT */
+ if (nv_rtt_factor > 0) {
+ if (ca->nv_last_rtt > 0) {
+ avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor +
+ ((u64)ca->nv_last_rtt)
+ * (256 - nv_rtt_factor)) >> 8;
+ } else {
+ avg_rtt = sample->rtt_us;
+ ca->nv_min_rtt = avg_rtt << 1;
+ }
+ ca->nv_last_rtt = avg_rtt;
+ } else {
+ avg_rtt = sample->rtt_us;
+ }
+
+ /* rate in 100's bits per second */
+ rate64 = ((u64)sample->in_flight) * 8000000;
+ rate = (u32)div64_u64(rate64, (u64)(avg_rtt * 100));
+
+ /* Remember the maximum rate seen during this RTT
+ * Note: It may be more than one RTT. This function should be
+ * called at least nv_dec_eval_min_calls times.
+ */
+ if (ca->nv_rtt_max_rate < rate)
+ ca->nv_rtt_max_rate = rate;
+
+ /* We have valid information, increment counter */
+ if (ca->nv_eval_call_cnt < 255)
+ ca->nv_eval_call_cnt++;
+
+ /* update min rtt if necessary */
+ if (avg_rtt < ca->nv_min_rtt)
+ ca->nv_min_rtt = avg_rtt;
+
+ /* update future min_rtt if necessary */
+ if (avg_rtt < ca->nv_min_rtt_new)
+ ca->nv_min_rtt_new = avg_rtt;
+
+ /* nv_min_rtt is updated with the minimum (possibley averaged) rtt
+ * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a
+ * warm reset). This new nv_min_rtt will be continued to be updated
+ * and be used for another sysctl_tcp_nv_reset_period seconds,
+ * when it will be updated again.
+ * In practice we introduce some randomness, so the actual period used
+ * is chosen randomly from the range:
+ * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4)
+ */
+ if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
+ unsigned char rand;
+
+ ca->nv_min_rtt = ca->nv_min_rtt_new;
+ ca->nv_min_rtt_new = NV_INIT_RTT;
+ get_random_bytes(&rand, 1);
+ ca->nv_min_rtt_reset_jiffies =
+ now + ((nv_reset_period * (384 + rand) * HZ) >> 9);
+ /* Every so often we decrease ca->nv_min_cwnd in case previous
+ * value is no longer accurate.
+ */
+ ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND);
+ }
+
+ /* Once per RTT check if we need to do congestion avoidance */
+ if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
+ ca->nv_rtt_start_seq = tp->snd_nxt;
+ if (ca->nv_rtt_cnt < 0xff)
+ /* Increase counter for RTTs without CA decision */
+ ca->nv_rtt_cnt++;
+
+ /* If this function is only called once within an RTT
+ * the cwnd is probably too small (in some cases due to
+ * tso, lro or interrupt coalescence), so we increase
+ * ca->nv_min_cwnd.
+ */
+ if (ca->nv_eval_call_cnt == 1 &&
+ bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache &&
+ ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) {
+ ca->nv_min_cwnd = min(ca->nv_min_cwnd
+ + NV_MIN_CWND_GROW,
+ NV_TSO_CWND_BOUND + 1);
+ ca->nv_rtt_start_seq = tp->snd_nxt +
+ ca->nv_min_cwnd * tp->mss_cache;
+ ca->nv_eval_call_cnt = 0;
+ ca->nv_allow_cwnd_growth = 1;
+ return;
+ }
+
+ /* Find the ideal cwnd for current rate from slope
+ * slope = 80000.0 * mss / nv_min_rtt
+ * cwnd_by_slope = nv_rtt_max_rate / slope
+ */
+ cwnd_by_slope = (u32)
+ div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
+ (u64)(80000 * tp->mss_cache));
+ max_win = cwnd_by_slope + nv_pad;
+
+ /* If cwnd > max_win, decrease cwnd
+ * if cwnd < max_win, grow cwnd
+ * else leave the same
+ */
+ if (tp->snd_cwnd > max_win) {
+ /* there is congestion, check that it is ok
+ * to make a CA decision
+ * 1. We should have at least nv_dec_eval_min_calls
+ * data points before making a CA decision
+ * 2. We only make a congesion decision after
+ * nv_rtt_min_cnt RTTs
+ */
+ if (ca->nv_rtt_cnt < nv_rtt_min_cnt) {
+ return;
+ } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
+ if (ca->nv_eval_call_cnt <
+ nv_ssthresh_eval_min_calls)
+ return;
+ /* otherwise we will decrease cwnd */
+ } else if (ca->nv_eval_call_cnt <
+ nv_dec_eval_min_calls) {
+ if (ca->nv_allow_cwnd_growth &&
+ ca->nv_rtt_cnt > nv_stop_rtt_cnt)
+ ca->nv_allow_cwnd_growth = 0;
+ return;
+ }
+
+ /* We have enough data to determine we are congested */
+ ca->nv_allow_cwnd_growth = 0;
+ tp->snd_ssthresh =
+ (nv_ssthresh_factor * max_win) >> 3;
+ if (tp->snd_cwnd - max_win > 2) {
+ /* gap > 2, we do exponential cwnd decrease */
+ int dec;
+
+ dec = max(2U, ((tp->snd_cwnd - max_win) *
+ nv_cong_dec_mult) >> 7);
+ tp->snd_cwnd -= dec;
+ } else if (nv_cong_dec_mult > 0) {
+ tp->snd_cwnd = max_win;
+ }
+ if (ca->cwnd_growth_factor > 0)
+ ca->cwnd_growth_factor = 0;
+ ca->nv_no_cong_cnt = 0;
+ } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) {
+ /* There is no congestion, grow cwnd if allowed*/
+ if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls)
+ return;
+
+ ca->nv_allow_cwnd_growth = 1;
+ ca->nv_no_cong_cnt++;
+ if (ca->cwnd_growth_factor < 0 &&
+ nv_cwnd_growth_rate_neg > 0 &&
+ ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) {
+ ca->cwnd_growth_factor++;
+ ca->nv_no_cong_cnt = 0;
+ } else if (ca->cwnd_growth_factor >= 0 &&
+ nv_cwnd_growth_rate_pos > 0 &&
+ ca->nv_no_cong_cnt >
+ nv_cwnd_growth_rate_pos) {
+ ca->cwnd_growth_factor++;
+ ca->nv_no_cong_cnt = 0;
+ }
+ } else {
+ /* cwnd is in-between, so do nothing */
+ return;
+ }
+
+ /* update state */
+ ca->nv_eval_call_cnt = 0;
+ ca->nv_rtt_cnt = 0;
+ ca->nv_rtt_max_rate = 0;
+
+ /* Don't want to make cwnd < nv_min_cwnd
+ * (it wasn't before, if it is now is because nv
+ * decreased it).
+ */
+ if (tp->snd_cwnd < nv_min_cwnd)
+ tp->snd_cwnd = nv_min_cwnd;
+ }
+}
+
+/* Extract info for Tcp socket info provided via netlink */
+size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct tcpnv *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
+ info->vegas.tcpv_rtt = ca->nv_last_rtt;
+ info->vegas.tcpv_minrtt = ca->nv_min_rtt;
+
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcpnv_get_info);
+
+static struct tcp_congestion_ops tcpnv __read_mostly = {
+ .init = tcpnv_init,
+ .ssthresh = tcpnv_recalc_ssthresh,
+ .cong_avoid = tcpnv_cong_avoid,
+ .set_state = tcpnv_state,
+ .undo_cwnd = tcpnv_undo_cwnd,
+ .pkts_acked = tcpnv_acked,
+ .get_info = tcpnv_get_info,
+
+ .owner = THIS_MODULE,
+ .name = "nv",
+};
+
+static int __init tcpnv_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
+
+ return tcp_register_congestion_control(&tcpnv);
+}
+
+static void __exit tcpnv_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcpnv);
+}
+
+module_init(tcpnv_register);
+module_exit(tcpnv_unregister);
+
+MODULE_AUTHOR("Lawrence Brakmo");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP NV");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 30a7dd4f6..ac98740c5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -912,9 +912,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
+ tp = tcp_sk(sk);
if (clone_it) {
skb_mstamp_get(&skb->skb_mstamp);
+ TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
+ - tp->snd_una;
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
@@ -925,7 +928,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
}
inet = inet_sk(sk);
- tp = tcp_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
@@ -1971,12 +1973,14 @@ static int tcp_mtu_probe(struct sock *sk)
len = 0;
tcp_for_write_queue_from_safe(skb, next, sk) {
copy = min_t(int, skb->len, probe_size - len);
- if (nskb->ip_summed)
+ if (nskb->ip_summed) {
skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
- else
- nskb->csum = skb_copy_and_csum_bits(skb, 0,
- skb_put(nskb, copy),
- copy, nskb->csum);
+ } else {
+ __wsum csum = skb_copy_and_csum_bits(skb, 0,
+ skb_put(nskb, copy),
+ copy, 0);
+ nskb->csum = csum_block_add(nskb->csum, csum, len);
+ }
if (skb->len <= copy) {
/* We've eaten all the data from this skb.
@@ -2610,7 +2614,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
* copying overhead: fragmentation, tunneling, mangling etc.
*/
if (atomic_read(&sk->sk_wmem_alloc) >
- min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
+ min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
+ sk->sk_sndbuf))
return -EAGAIN;
if (skb_still_in_host_queue(sk, skb))
@@ -2835,7 +2840,7 @@ begin_fwd:
if (tcp_retransmit_skb(sk, skb, segs))
return;
- NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
@@ -3580,6 +3585,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ if (unlikely(tcp_passive_fastopen(sk)))
+ tcp_sk(sk)->total_retrans++;
}
return res;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index debdd8b33..f712b411f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -24,6 +24,13 @@
int sysctl_tcp_thin_linear_timeouts __read_mostly;
+/**
+ * tcp_write_err() - close socket and save error info
+ * @sk: The socket the error has appeared on.
+ *
+ * Returns: Nothing (void)
+ */
+
static void tcp_write_err(struct sock *sk)
{
sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -33,16 +40,21 @@ static void tcp_write_err(struct sock *sk)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}
-/* Do not allow orphaned sockets to eat all our resources.
- * This is direct violation of TCP specs, but it is required
- * to prevent DoS attacks. It is called when a retransmission timeout
- * or zero probe timeout occurs on orphaned socket.
+/**
+ * tcp_out_of_resources() - Close socket if out of resources
+ * @sk: pointer to current socket
+ * @do_reset: send a last packet with reset flag
*
- * Criteria is still not confirmed experimentally and may change.
- * We kill the socket, if:
- * 1. If number of orphaned sockets exceeds an administratively configured
- * limit.
- * 2. If we have strong memory pressure.
+ * Do not allow orphaned sockets to eat all our resources.
+ * This is direct violation of TCP specs, but it is required
+ * to prevent DoS attacks. It is called when a retransmission timeout
+ * or zero probe timeout occurs on orphaned socket.
+ *
+ * Criteria is still not confirmed experimentally and may change.
+ * We kill the socket, if:
+ * 1. If number of orphaned sockets exceeds an administratively configured
+ * limit.
+ * 2. If we have strong memory pressure.
*/
static int tcp_out_of_resources(struct sock *sk, bool do_reset)
{
@@ -74,7 +86,11 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
return 0;
}
-/* Calculate maximal number or retries on an orphaned socket. */
+/**
+ * tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket
+ * @sk: Pointer to the current socket.
+ * @alive: bool, socket alive state
+ */
static int tcp_orphan_retries(struct sock *sk, bool alive)
{
int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
@@ -115,10 +131,22 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
}
}
-/* This function calculates a "timeout" which is equivalent to the timeout of a
- * TCP connection after "boundary" unsuccessful, exponentially backed-off
+
+/**
+ * retransmits_timed_out() - returns true if this connection has timed out
+ * @sk: The current socket
+ * @boundary: max number of retransmissions
+ * @timeout: A custom timeout value.
+ * If set to 0 the default timeout is calculated and used.
+ * Using TCP_RTO_MIN and the number of unsuccessful retransmits.
+ * @syn_set: true if the SYN Bit was set.
+ *
+ * The default "timeout" value this function can calculate and use
+ * is equivalent to the timeout of a TCP Connection
+ * after "boundary" unsuccessful, exponentially backed-off
* retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
* syn_set flag is set.
+ *
*/
static bool retransmits_timed_out(struct sock *sk,
unsigned int boundary,
@@ -257,6 +285,16 @@ out:
sk_mem_reclaim(sk);
}
+
+/**
+ * tcp_delack_timer() - The TCP delayed ACK timeout handler
+ * @data: Pointer to the current socket. (gets casted to struct sock *)
+ *
+ * This function gets (indirectly) called when the kernel timer for a TCP packet
+ * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work.
+ *
+ * Returns: Nothing (void)
+ */
static void tcp_delack_timer(unsigned long data)
{
struct sock *sk = (struct sock *)data;
@@ -346,14 +384,23 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
*/
inet_rtx_syn_ack(sk, req);
req->num_timeout++;
+ icsk->icsk_retransmits++;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
}
-/*
- * The TCP retransmit timer.
- */
+/**
+ * tcp_retransmit_timer() - The TCP retransmit timeout handler
+ * @sk: Pointer to the current socket.
+ *
+ * This function gets called when the kernel timer for a TCP packet
+ * of this socket expires.
+ *
+ * It handles retransmission, timer adjustment and other necesarry measures.
+ *
+ * Returns: Nothing (void)
+ */
void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -494,7 +541,8 @@ out_reset_timer:
out:;
}
-/* Called with BH disabled */
+/* Called with bottom-half processing disabled.
+ Called by tcp_write_timer() */
void tcp_write_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -539,7 +587,7 @@ static void tcp_write_timer(unsigned long data)
if (!sock_owned_by_user(sk)) {
tcp_write_timer_handler(sk);
} else {
- /* deleguate our work to tcp_release_cb() */
+ /* delegate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
sock_hold(sk);
}
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 0d0171830..ec35eaa5c 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -6,6 +6,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/mpls.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
@@ -16,11 +17,14 @@
static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
+static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly;
static DEFINE_MUTEX(tunnel4_mutex);
static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
{
- return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers;
+ return (family == AF_INET) ? &tunnel4_handlers :
+ (family == AF_INET6) ? &tunnel64_handlers :
+ &tunnelmpls4_handlers;
}
int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
@@ -125,6 +129,26 @@ drop:
}
#endif
+#if IS_ENABLED(CONFIG_MPLS)
+static int tunnelmpls4_rcv(struct sk_buff *skb)
+{
+ struct xfrm_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct mpls_label)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+
static void tunnel4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
@@ -145,6 +169,17 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
}
#endif
+#if IS_ENABLED(CONFIG_MPLS)
+static void tunnelmpls4_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+#endif
+
static const struct net_protocol tunnel4_protocol = {
.handler = tunnel4_rcv,
.err_handler = tunnel4_err,
@@ -161,24 +196,47 @@ static const struct net_protocol tunnel64_protocol = {
};
#endif
+#if IS_ENABLED(CONFIG_MPLS)
+static const struct net_protocol tunnelmpls4_protocol = {
+ .handler = tunnelmpls4_rcv,
+ .err_handler = tunnelmpls4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+#endif
+
static int __init tunnel4_init(void)
{
- if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) {
- pr_err("%s: can't add protocol\n", __func__);
- return -EAGAIN;
- }
+ if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP))
+ goto err;
#if IS_ENABLED(CONFIG_IPV6)
if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
- pr_err("tunnel64 init: can't add protocol\n");
inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
- return -EAGAIN;
+ goto err;
+ }
+#endif
+#if IS_ENABLED(CONFIG_MPLS)
+ if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) {
+ inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+#if IS_ENABLED(CONFIG_IPV6)
+ inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6);
+#endif
+ goto err;
}
#endif
return 0;
+
+err:
+ pr_err("%s: can't add protocol\n", __func__);
+ return -EAGAIN;
}
static void __exit tunnel4_fini(void)
{
+#if IS_ENABLED(CONFIG_MPLS)
+ if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS))
+ pr_err("tunnelmpls4 close: can't remove protocol\n");
+#endif
#if IS_ENABLED(CONFIG_IPV6)
if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
pr_err("tunnel64 close: can't remove protocol\n");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 00d18c57c..5fdcb8d10 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2216,7 +2216,6 @@ struct proto udp_prot = {
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udp_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 47f12c73d..58bd39fb1 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -76,6 +76,67 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
}
EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
+void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
+ unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct udp_tunnel_info ti;
+
+ if (!dev->netdev_ops->ndo_udp_tunnel_add)
+ return;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
+
+/* Notify netdevs that UDP port started listening */
+void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct udp_tunnel_info ti;
+ struct net_device *dev;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (!dev->netdev_ops->ndo_udp_tunnel_add)
+ continue;
+ dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
+
+/* Notify netdevs that UDP port is no more listening */
+void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct udp_tunnel_info ti;
+ struct net_device *dev;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (!dev->netdev_ops->ndo_udp_tunnel_del)
+ continue;
+ dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
+
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl,
__be16 df, __be16 src_port, __be16 dst_port,
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 3b3efbda4..2eea073e2 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -55,7 +55,6 @@ struct proto udplite_prot = {
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
.obj_size = sizeof(struct udp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 7b0edb37a..41f5b504a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -29,7 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
memset(fl4, 0, sizeof(*fl4));
fl4->daddr = daddr->a4;
fl4->flowi4_tos = tos;
- fl4->flowi4_oif = oif;
+ fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
if (saddr)
fl4->saddr = saddr->a4;
@@ -295,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = {
{ }
};
-static int __net_init xfrm4_net_sysctl_init(struct net *net)
+static __net_init int xfrm4_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -323,7 +323,7 @@ err_alloc:
return -ENOMEM;
}
-static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
+static __net_exit void xfrm4_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
@@ -336,12 +336,12 @@ static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
kfree(table);
}
#else /* CONFIG_SYSCTL */
-static int inline xfrm4_net_sysctl_init(struct net *net)
+static inline int xfrm4_net_sysctl_init(struct net *net)
{
return 0;
}
-static void inline xfrm4_net_sysctl_exit(struct net *net)
+static inline void xfrm4_net_sysctl_exit(struct net *net)
{
}
#endif
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 6d8ea0992..c174ccb34 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -22,6 +22,7 @@ ipv6-$(CONFIG_NETFILTER) += netfilter.o
ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
ipv6-$(CONFIG_PROC_FS) += proc.o
ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
+ipv6-$(CONFIG_NETLABEL) += calipso.o
ipv6-objs += $(ipv6-y)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 82e367b9e..2f1f5d439 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -547,7 +547,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex,
struct sk_buff *skb;
int err = -ENOBUFS;
- skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_ATOMIC);
+ skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_KERNEL);
if (!skb)
goto errout;
@@ -559,7 +559,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex,
kfree_skb(skb);
goto errout;
}
- rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_KERNEL);
return;
errout:
rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err);
@@ -778,7 +778,14 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
}
if (p == &net->ipv6.devconf_all->forwarding) {
+ int old_dflt = net->ipv6.devconf_dflt->forwarding;
+
net->ipv6.devconf_dflt->forwarding = newf;
+ if ((!newf) ^ (!old_dflt))
+ inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+
addrconf_forward_change(net, newf);
if ((!newf) ^ (!old))
inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING,
@@ -1524,6 +1531,28 @@ out:
return hiscore_idx;
}
+static int ipv6_get_saddr_master(struct net *net,
+ const struct net_device *dst_dev,
+ const struct net_device *master,
+ struct ipv6_saddr_dst *dst,
+ struct ipv6_saddr_score *scores,
+ int hiscore_idx)
+{
+ struct inet6_dev *idev;
+
+ idev = __in6_dev_get(dst_dev);
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev,
+ scores, hiscore_idx);
+
+ idev = __in6_dev_get(master);
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev,
+ scores, hiscore_idx);
+
+ return hiscore_idx;
+}
+
int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
const struct in6_addr *daddr, unsigned int prefs,
struct in6_addr *saddr)
@@ -1577,13 +1606,39 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
if (idev)
hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
} else {
+ const struct net_device *master;
+ int master_idx = 0;
+
+ /* if dst_dev exists and is enslaved to an L3 device, then
+ * prefer addresses from dst_dev and then the master over
+ * any other enslaved devices in the L3 domain.
+ */
+ master = l3mdev_master_dev_rcu(dst_dev);
+ if (master) {
+ master_idx = master->ifindex;
+
+ hiscore_idx = ipv6_get_saddr_master(net, dst_dev,
+ master, &dst,
+ scores, hiscore_idx);
+
+ if (scores[hiscore_idx].ifa)
+ goto out;
+ }
+
for_each_netdev_rcu(net, dev) {
+ /* only consider addresses on devices in the
+ * same L3 domain
+ */
+ if (l3mdev_master_ifindex_rcu(dev) != master_idx)
+ continue;
idev = __in6_dev_get(dev);
if (!idev)
continue;
hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
}
}
+
+out:
rcu_read_unlock();
hiscore = &scores[hiscore_idx];
@@ -1824,7 +1879,6 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp)
void addrconf_dad_failure(struct inet6_ifaddr *ifp)
{
- struct in6_addr addr;
struct inet6_dev *idev = ifp->idev;
struct net *net = dev_net(ifp->idev->dev);
@@ -1886,18 +1940,6 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
in6_ifa_put(ifp2);
lock_errdad:
spin_lock_bh(&ifp->lock);
- } else if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) {
- addr.s6_addr32[0] = htonl(0xfe800000);
- addr.s6_addr32[1] = 0;
-
- if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
- ipv6_addr_equal(&ifp->addr, &addr)) {
- /* DAD failed for link-local based on MAC address */
- idev->cnf.disable_ipv6 = 1;
-
- pr_info("%s: IPv6 being disabled!\n",
- ifp->idev->dev->name);
- }
}
errdad:
@@ -2255,7 +2297,7 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
return ERR_PTR(-EACCES);
/* Add default multicast route */
- if (!(dev->flags & IFF_LOOPBACK))
+ if (!(dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev))
addrconf_add_mroute(dev);
return idev;
@@ -2334,12 +2376,109 @@ static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
}
+int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
+ const struct prefix_info *pinfo,
+ struct inet6_dev *in6_dev,
+ const struct in6_addr *addr, int addr_type,
+ u32 addr_flags, bool sllao, bool tokenized,
+ __u32 valid_lft, u32 prefered_lft)
+{
+ struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1);
+ int create = 0, update_lft = 0;
+
+ if (!ifp && valid_lft) {
+ int max_addresses = in6_dev->cnf.max_addresses;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ if (in6_dev->cnf.optimistic_dad &&
+ !net->ipv6.devconf_all->forwarding && sllao)
+ addr_flags |= IFA_F_OPTIMISTIC;
+#endif
+
+ /* Do not allow to create too much of autoconfigured
+ * addresses; this would be too easy way to crash kernel.
+ */
+ if (!max_addresses ||
+ ipv6_count_addresses(in6_dev) < max_addresses)
+ ifp = ipv6_add_addr(in6_dev, addr, NULL,
+ pinfo->prefix_len,
+ addr_type&IPV6_ADDR_SCOPE_MASK,
+ addr_flags, valid_lft,
+ prefered_lft);
+
+ if (IS_ERR_OR_NULL(ifp))
+ return -1;
+
+ update_lft = 0;
+ create = 1;
+ spin_lock_bh(&ifp->lock);
+ ifp->flags |= IFA_F_MANAGETEMPADDR;
+ ifp->cstamp = jiffies;
+ ifp->tokenized = tokenized;
+ spin_unlock_bh(&ifp->lock);
+ addrconf_dad_start(ifp);
+ }
+
+ if (ifp) {
+ u32 flags;
+ unsigned long now;
+ u32 stored_lft;
+
+ /* update lifetime (RFC2462 5.5.3 e) */
+ spin_lock_bh(&ifp->lock);
+ now = jiffies;
+ if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
+ stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
+ else
+ stored_lft = 0;
+ if (!update_lft && !create && stored_lft) {
+ const u32 minimum_lft = min_t(u32,
+ stored_lft, MIN_VALID_LIFETIME);
+ valid_lft = max(valid_lft, minimum_lft);
+
+ /* RFC4862 Section 5.5.3e:
+ * "Note that the preferred lifetime of the
+ * corresponding address is always reset to
+ * the Preferred Lifetime in the received
+ * Prefix Information option, regardless of
+ * whether the valid lifetime is also reset or
+ * ignored."
+ *
+ * So we should always update prefered_lft here.
+ */
+ update_lft = 1;
+ }
+
+ if (update_lft) {
+ ifp->valid_lft = valid_lft;
+ ifp->prefered_lft = prefered_lft;
+ ifp->tstamp = now;
+ flags = ifp->flags;
+ ifp->flags &= ~IFA_F_DEPRECATED;
+ spin_unlock_bh(&ifp->lock);
+
+ if (!(flags&IFA_F_TENTATIVE))
+ ipv6_ifa_notify(0, ifp);
+ } else
+ spin_unlock_bh(&ifp->lock);
+
+ manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft,
+ create, now);
+
+ in6_ifa_put(ifp);
+ addrconf_verify();
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr);
+
void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
{
struct prefix_info *pinfo;
__u32 valid_lft;
__u32 prefered_lft;
- int addr_type;
+ int addr_type, err;
u32 addr_flags = 0;
struct inet6_dev *in6_dev;
struct net *net = dev_net(dev);
@@ -2433,10 +2572,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
/* Try to figure out our local address for this prefix */
if (pinfo->autoconf && in6_dev->cnf.autoconf) {
- struct inet6_ifaddr *ifp;
struct in6_addr addr;
- int create = 0, update_lft = 0;
- bool tokenized = false;
+ bool tokenized = false, dev_addr_generated = false;
if (pinfo->prefix_len == 64) {
memcpy(&addr, &pinfo->prefix, 8);
@@ -2454,106 +2591,36 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
goto ok;
} else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
- in6_dev_put(in6_dev);
- return;
+ goto put;
+ } else {
+ dev_addr_generated = true;
}
goto ok;
}
net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n",
pinfo->prefix_len);
- in6_dev_put(in6_dev);
- return;
+ goto put;
ok:
+ err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
+ &addr, addr_type,
+ addr_flags, sllao,
+ tokenized, valid_lft,
+ prefered_lft);
+ if (err)
+ goto put;
- ifp = ipv6_get_ifaddr(net, &addr, dev, 1);
-
- if (!ifp && valid_lft) {
- int max_addresses = in6_dev->cnf.max_addresses;
-
-#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
- if (in6_dev->cnf.optimistic_dad &&
- !net->ipv6.devconf_all->forwarding && sllao)
- addr_flags |= IFA_F_OPTIMISTIC;
-#endif
-
- /* Do not allow to create too much of autoconfigured
- * addresses; this would be too easy way to crash kernel.
- */
- if (!max_addresses ||
- ipv6_count_addresses(in6_dev) < max_addresses)
- ifp = ipv6_add_addr(in6_dev, &addr, NULL,
- pinfo->prefix_len,
- addr_type&IPV6_ADDR_SCOPE_MASK,
- addr_flags, valid_lft,
- prefered_lft);
-
- if (IS_ERR_OR_NULL(ifp)) {
- in6_dev_put(in6_dev);
- return;
- }
-
- update_lft = 0;
- create = 1;
- spin_lock_bh(&ifp->lock);
- ifp->flags |= IFA_F_MANAGETEMPADDR;
- ifp->cstamp = jiffies;
- ifp->tokenized = tokenized;
- spin_unlock_bh(&ifp->lock);
- addrconf_dad_start(ifp);
- }
-
- if (ifp) {
- u32 flags;
- unsigned long now;
- u32 stored_lft;
-
- /* update lifetime (RFC2462 5.5.3 e) */
- spin_lock_bh(&ifp->lock);
- now = jiffies;
- if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
- stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
- else
- stored_lft = 0;
- if (!update_lft && !create && stored_lft) {
- const u32 minimum_lft = min_t(u32,
- stored_lft, MIN_VALID_LIFETIME);
- valid_lft = max(valid_lft, minimum_lft);
-
- /* RFC4862 Section 5.5.3e:
- * "Note that the preferred lifetime of the
- * corresponding address is always reset to
- * the Preferred Lifetime in the received
- * Prefix Information option, regardless of
- * whether the valid lifetime is also reset or
- * ignored."
- *
- * So we should always update prefered_lft here.
- */
- update_lft = 1;
- }
-
- if (update_lft) {
- ifp->valid_lft = valid_lft;
- ifp->prefered_lft = prefered_lft;
- ifp->tstamp = now;
- flags = ifp->flags;
- ifp->flags &= ~IFA_F_DEPRECATED;
- spin_unlock_bh(&ifp->lock);
-
- if (!(flags&IFA_F_TENTATIVE))
- ipv6_ifa_notify(0, ifp);
- } else
- spin_unlock_bh(&ifp->lock);
-
- manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft,
- create, now);
-
- in6_ifa_put(ifp);
- addrconf_verify();
- }
+ /* Ignore error case here because previous prefix add addr was
+ * successful which will be notified.
+ */
+ ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr,
+ addr_type, addr_flags, sllao,
+ tokenized, valid_lft,
+ prefered_lft,
+ dev_addr_generated);
}
inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
+put:
in6_dev_put(in6_dev);
}
@@ -2948,8 +3015,8 @@ static void init_loopback(struct net_device *dev)
}
}
-static void addrconf_add_linklocal(struct inet6_dev *idev,
- const struct in6_addr *addr, u32 flags)
+void addrconf_add_linklocal(struct inet6_dev *idev,
+ const struct in6_addr *addr, u32 flags)
{
struct inet6_ifaddr *ifp;
u32 addr_flags = flags | IFA_F_PERMANENT;
@@ -2968,6 +3035,7 @@ static void addrconf_add_linklocal(struct inet6_dev *idev,
in6_ifa_put(ifp);
}
}
+EXPORT_SYMBOL_GPL(addrconf_add_linklocal);
static bool ipv6_reserved_interfaceid(struct in6_addr address)
{
@@ -3551,8 +3619,7 @@ restart:
state = ifa->state;
ifa->state = INET6_IFADDR_STATE_DEAD;
- list_del(&ifa->if_list);
- list_add(&ifa->if_list, &del_list);
+ list_move(&ifa->if_list, &del_list);
}
spin_unlock_bh(&ifa->lock);
@@ -3749,6 +3816,7 @@ static void addrconf_dad_work(struct work_struct *w)
dad_work);
struct inet6_dev *idev = ifp->idev;
struct in6_addr mcaddr;
+ bool disable_ipv6 = false;
enum {
DAD_PROCESS,
@@ -3765,6 +3833,24 @@ static void addrconf_dad_work(struct work_struct *w)
} else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) {
action = DAD_ABORT;
ifp->state = INET6_IFADDR_STATE_POSTDAD;
+
+ if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 &&
+ !(ifp->flags & IFA_F_STABLE_PRIVACY)) {
+ struct in6_addr addr;
+
+ addr.s6_addr32[0] = htonl(0xfe800000);
+ addr.s6_addr32[1] = 0;
+
+ if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
+ ipv6_addr_equal(&ifp->addr, &addr)) {
+ /* DAD failed for link-local based on MAC */
+ idev->cnf.disable_ipv6 = 1;
+
+ pr_info("%s: IPv6 being disabled!\n",
+ ifp->idev->dev->name);
+ disable_ipv6 = true;
+ }
+ }
}
spin_unlock_bh(&ifp->lock);
@@ -3774,6 +3860,8 @@ static void addrconf_dad_work(struct work_struct *w)
} else if (action == DAD_ABORT) {
in6_ifa_hold(ifp);
addrconf_dad_stop(ifp, 1);
+ if (disable_ipv6)
+ addrconf_ifdown(idev->dev, 0);
goto out;
}
@@ -5946,7 +6034,7 @@ static const struct ctl_table addrconf_sysctl[] = {
static int __addrconf_sysctl_register(struct net *net, char *dev_name,
struct inet6_dev *idev, struct ipv6_devconf *p)
{
- int i;
+ int i, ifindex;
struct ctl_table *table;
char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];
@@ -5966,6 +6054,13 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name,
if (!p->sysctl_header)
goto free;
+ if (!strcmp(dev_name, "all"))
+ ifindex = NETCONFA_IFINDEX_ALL;
+ else if (!strcmp(dev_name, "default"))
+ ifindex = NETCONFA_IFINDEX_DEFAULT;
+ else
+ ifindex = idev->dev->ifindex;
+ inet6_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p);
return 0;
free:
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index bfa86f040..b454055ba 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -60,6 +60,7 @@
#ifdef CONFIG_IPV6_TUNNEL
#include <net/ip6_tunnel.h>
#endif
+#include <net/calipso.h>
#include <asm/uaccess.h>
#include <linux/mroute6.h>
@@ -92,6 +93,12 @@ MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces");
module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444);
MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces");
+bool ipv6_mod_enabled(void)
+{
+ return disable_ipv6_mod == 0;
+}
+EXPORT_SYMBOL_GPL(ipv6_mod_enabled);
+
static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
{
const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
@@ -977,6 +984,10 @@ static int __init inet6_init(void)
if (err)
goto pingv6_fail;
+ err = calipso_init();
+ if (err)
+ goto calipso_fail;
+
#ifdef CONFIG_SYSCTL
err = ipv6_sysctl_register();
if (err)
@@ -987,8 +998,10 @@ out:
#ifdef CONFIG_SYSCTL
sysctl_fail:
- pingv6_exit();
+ calipso_exit();
#endif
+calipso_fail:
+ pingv6_exit();
pingv6_fail:
ipv6_packet_cleanup();
ipv6_packet_fail:
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
new file mode 100644
index 000000000..37ac9de71
--- /dev/null
+++ b/net/ipv6/calipso.c
@@ -0,0 +1,1475 @@
+/*
+ * CALIPSO - Common Architecture Label IPv6 Security Option
+ *
+ * This is an implementation of the CALIPSO protocol as specified in
+ * RFC 5570.
+ *
+ * Authors: Paul Moore <paul.moore@hp.com>
+ * Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/netlabel.h>
+#include <net/calipso.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+#include <linux/crc-ccitt.h>
+
+/* Maximium size of the calipso option including
+ * the two-byte TLV header.
+ */
+#define CALIPSO_OPT_LEN_MAX (2 + 252)
+
+/* Size of the minimum calipso option including
+ * the two-byte TLV header.
+ */
+#define CALIPSO_HDR_LEN (2 + 8)
+
+/* Maximium size of the calipso option including
+ * the two-byte TLV header and upto 3 bytes of
+ * leading pad and 7 bytes of trailing pad.
+ */
+#define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7)
+
+ /* Maximium size of u32 aligned buffer required to hold calipso
+ * option. Max of 3 initial pad bytes starting from buffer + 3.
+ * i.e. the worst case is when the previous tlv finishes on 4n + 3.
+ */
+#define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX)
+
+/* List of available DOI definitions */
+static DEFINE_SPINLOCK(calipso_doi_list_lock);
+static LIST_HEAD(calipso_doi_list);
+
+/* Label mapping cache */
+int calipso_cache_enabled = 1;
+int calipso_cache_bucketsize = 10;
+#define CALIPSO_CACHE_BUCKETBITS 7
+#define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS)
+#define CALIPSO_CACHE_REORDERLIMIT 10
+struct calipso_map_cache_bkt {
+ spinlock_t lock;
+ u32 size;
+ struct list_head list;
+};
+
+struct calipso_map_cache_entry {
+ u32 hash;
+ unsigned char *key;
+ size_t key_len;
+
+ struct netlbl_lsm_cache *lsm_data;
+
+ u32 activity;
+ struct list_head list;
+};
+
+static struct calipso_map_cache_bkt *calipso_cache;
+
+/* Label Mapping Cache Functions
+ */
+
+/**
+ * calipso_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * This function frees the memory associated with a cache entry including the
+ * LSM cache data if there are no longer any users, i.e. reference count == 0.
+ *
+ */
+static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry)
+{
+ if (entry->lsm_data)
+ netlbl_secattr_cache_free(entry->lsm_data);
+ kfree(entry->key);
+ kfree(entry);
+}
+
+/**
+ * calipso_map_cache_hash - Hashing function for the CALIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CALIPSO tag hashing function. Returns a 32-bit hash value.
+ *
+ */
+static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+/**
+ * calipso_cache_init - Initialize the CALIPSO cache
+ *
+ * Description:
+ * Initializes the CALIPSO label mapping cache, this function should be called
+ * before any of the other functions defined in this file. Returns zero on
+ * success, negative values on error.
+ *
+ */
+static int __init calipso_cache_init(void)
+{
+ u32 iter;
+
+ calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS,
+ sizeof(struct calipso_map_cache_bkt),
+ GFP_KERNEL);
+ if (!calipso_cache)
+ return -ENOMEM;
+
+ for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) {
+ spin_lock_init(&calipso_cache[iter].lock);
+ calipso_cache[iter].size = 0;
+ INIT_LIST_HEAD(&calipso_cache[iter].list);
+ }
+
+ return 0;
+}
+
+/**
+ * calipso_cache_invalidate - Invalidates the current CALIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CALIPSO cache. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+static void calipso_cache_invalidate(void)
+{
+ struct calipso_map_cache_entry *entry, *tmp_entry;
+ u32 iter;
+
+ for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) {
+ spin_lock_bh(&calipso_cache[iter].lock);
+ list_for_each_entry_safe(entry,
+ tmp_entry,
+ &calipso_cache[iter].list, list) {
+ list_del(&entry->list);
+ calipso_cache_entry_free(entry);
+ }
+ calipso_cache[iter].size = 0;
+ spin_unlock_bh(&calipso_cache[iter].lock);
+ }
+}
+
+/**
+ * calipso_cache_check - Check the CALIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key. If there is a match then the cache is adjusted and the
+ * @secattr struct is populated with the correct LSM security attributes. The
+ * cache is adjusted in the following manner if the entry is not already the
+ * first in the cache bucket:
+ *
+ * 1. The cache entry's activity counter is incremented
+ * 2. The previous (higher ranking) entry's activity counter is decremented
+ * 3. If the difference between the two activity counters is geater than
+ * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success, -ENOENT for a cache miss, and other negative values
+ * on error.
+ *
+ */
+static int calipso_cache_check(const unsigned char *key,
+ u32 key_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ u32 bkt;
+ struct calipso_map_cache_entry *entry;
+ struct calipso_map_cache_entry *prev_entry = NULL;
+ u32 hash;
+
+ if (!calipso_cache_enabled)
+ return -ENOENT;
+
+ hash = calipso_map_cache_hash(key, key_len);
+ bkt = hash & (CALIPSO_CACHE_BUCKETS - 1);
+ spin_lock_bh(&calipso_cache[bkt].lock);
+ list_for_each_entry(entry, &calipso_cache[bkt].list, list) {
+ if (entry->hash == hash &&
+ entry->key_len == key_len &&
+ memcmp(entry->key, key, key_len) == 0) {
+ entry->activity += 1;
+ atomic_inc(&entry->lsm_data->refcount);
+ secattr->cache = entry->lsm_data;
+ secattr->flags |= NETLBL_SECATTR_CACHE;
+ secattr->type = NETLBL_NLTYPE_CALIPSO;
+ if (!prev_entry) {
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+ return 0;
+ }
+
+ if (prev_entry->activity > 0)
+ prev_entry->activity -= 1;
+ if (entry->activity > prev_entry->activity &&
+ entry->activity - prev_entry->activity >
+ CALIPSO_CACHE_REORDERLIMIT) {
+ __list_del(entry->list.prev, entry->list.next);
+ __list_add(&entry->list,
+ prev_entry->list.prev,
+ &prev_entry->list);
+ }
+
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+ return 0;
+ }
+ prev_entry = entry;
+ }
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+
+ return -ENOENT;
+}
+
+/**
+ * calipso_cache_add - Add an entry to the CALIPSO cache
+ * @calipso_ptr: the CALIPSO option
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CALIPSO label mapping cache. Add the new entry to
+ * head of the cache bucket's list, if the cache bucket is out of room remove
+ * the last entry in the list first. It is important to note that there is
+ * currently no checking for duplicate keys. Returns zero on success,
+ * negative values on failure. The key stored starts at calipso_ptr + 2,
+ * i.e. the type and length bytes are not stored, this corresponds to
+ * calipso_ptr[1] bytes of data.
+ *
+ */
+static int calipso_cache_add(const unsigned char *calipso_ptr,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ u32 bkt;
+ struct calipso_map_cache_entry *entry = NULL;
+ struct calipso_map_cache_entry *old_entry = NULL;
+ u32 calipso_ptr_len;
+
+ if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0)
+ return 0;
+
+ calipso_ptr_len = calipso_ptr[1];
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+ entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC);
+ if (!entry->key) {
+ ret_val = -ENOMEM;
+ goto cache_add_failure;
+ }
+ entry->key_len = calipso_ptr_len;
+ entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len);
+ atomic_inc(&secattr->cache->refcount);
+ entry->lsm_data = secattr->cache;
+
+ bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1);
+ spin_lock_bh(&calipso_cache[bkt].lock);
+ if (calipso_cache[bkt].size < calipso_cache_bucketsize) {
+ list_add(&entry->list, &calipso_cache[bkt].list);
+ calipso_cache[bkt].size += 1;
+ } else {
+ old_entry = list_entry(calipso_cache[bkt].list.prev,
+ struct calipso_map_cache_entry, list);
+ list_del(&old_entry->list);
+ list_add(&entry->list, &calipso_cache[bkt].list);
+ calipso_cache_entry_free(old_entry);
+ }
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+
+ return 0;
+
+cache_add_failure:
+ if (entry)
+ calipso_cache_entry_free(entry);
+ return ret_val;
+}
+
+/* DOI List Functions
+ */
+
+/**
+ * calipso_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a DOI definition with a DOI value that
+ * matches @doi. The caller is responsible for calling rcu_read_[un]lock().
+ * Returns a pointer to the DOI definition on success and NULL on failure.
+ */
+static struct calipso_doi *calipso_doi_search(u32 doi)
+{
+ struct calipso_doi *iter;
+
+ list_for_each_entry_rcu(iter, &calipso_doi_list, list)
+ if (iter->doi == doi && atomic_read(&iter->refcount))
+ return iter;
+ return NULL;
+}
+
+/**
+ * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CALIPSO engine and calls this
+ * function to add it to the list of acceptable domains. The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see calipso.h for details). Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+static int calipso_doi_add(struct calipso_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -EINVAL;
+ u32 doi;
+ u32 doi_type;
+ struct audit_buffer *audit_buf;
+
+ doi = doi_def->doi;
+ doi_type = doi_def->type;
+
+ if (doi_def->doi == CALIPSO_DOI_UNKNOWN)
+ goto doi_add_return;
+
+ atomic_set(&doi_def->refcount, 1);
+
+ spin_lock(&calipso_doi_list_lock);
+ if (calipso_doi_search(doi_def->doi)) {
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = -EEXIST;
+ goto doi_add_return;
+ }
+ list_add_tail_rcu(&doi_def->list, &calipso_doi_list);
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = 0;
+
+doi_add_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info);
+ if (audit_buf) {
+ const char *type_str;
+
+ switch (doi_type) {
+ case CALIPSO_MAP_PASS:
+ type_str = "pass";
+ break;
+ default:
+ type_str = "(unknown)";
+ }
+ audit_log_format(audit_buf,
+ " calipso_doi=%u calipso_type=%s res=%u",
+ doi, type_str, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * calipso_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+static void calipso_doi_free(struct calipso_doi *doi_def)
+{
+ kfree(doi_def);
+}
+
+/**
+ * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void calipso_doi_free_rcu(struct rcu_head *entry)
+{
+ struct calipso_doi *doi_def;
+
+ doi_def = container_of(entry, struct calipso_doi, rcu);
+ calipso_doi_free(doi_def);
+}
+
+/**
+ * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_secid: the LSM secid to use in the audit message
+ *
+ * Description:
+ * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list. Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct calipso_doi *doi_def;
+ struct audit_buffer *audit_buf;
+
+ spin_lock(&calipso_doi_list_lock);
+ doi_def = calipso_doi_search(doi);
+ if (!doi_def) {
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = -ENOENT;
+ goto doi_remove_return;
+ }
+ if (!atomic_dec_and_test(&doi_def->refcount)) {
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = -EBUSY;
+ goto doi_remove_return;
+ }
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&calipso_doi_list_lock);
+
+ call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+ ret_val = 0;
+
+doi_remove_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info);
+ if (audit_buf) {
+ audit_log_format(audit_buf,
+ " calipso_doi=%u res=%u",
+ doi, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * calipso_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller. Otherwise NULL is returned. The caller must ensure that
+ * calipso_doi_putdef() is called when the caller is done.
+ *
+ */
+static struct calipso_doi *calipso_doi_getdef(u32 doi)
+{
+ struct calipso_doi *doi_def;
+
+ rcu_read_lock();
+ doi_def = calipso_doi_search(doi);
+ if (!doi_def)
+ goto doi_getdef_return;
+ if (!atomic_inc_not_zero(&doi_def->refcount))
+ doi_def = NULL;
+
+doi_getdef_return:
+ rcu_read_unlock();
+ return doi_def;
+}
+
+/**
+ * calipso_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from calipso_doi_getdef().
+ *
+ */
+static void calipso_doi_putdef(struct calipso_doi *doi_def)
+{
+ if (!doi_def)
+ return;
+
+ if (!atomic_dec_and_test(&doi_def->refcount))
+ return;
+ spin_lock(&calipso_doi_list_lock);
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&calipso_doi_list_lock);
+
+ call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+}
+
+/**
+ * calipso_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return. Updates the value in @skip_cnt upon
+ * return. Returns zero on success, negative values on failure.
+ *
+ */
+static int calipso_doi_walk(u32 *skip_cnt,
+ int (*callback)(struct calipso_doi *doi_def,
+ void *arg),
+ void *cb_arg)
+{
+ int ret_val = -ENOENT;
+ u32 doi_cnt = 0;
+ struct calipso_doi *iter_doi;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list)
+ if (atomic_read(&iter_doi->refcount) > 0) {
+ if (doi_cnt++ < *skip_cnt)
+ continue;
+ ret_val = callback(iter_doi, cb_arg);
+ if (ret_val < 0) {
+ doi_cnt--;
+ goto doi_walk_return;
+ }
+ }
+
+doi_walk_return:
+ rcu_read_unlock();
+ *skip_cnt = doi_cnt;
+ return ret_val;
+}
+
+/**
+ * calipso_validate - Validate a CALIPSO option
+ * @skb: the packet
+ * @option: the start of the option
+ *
+ * Description:
+ * This routine is called to validate a CALIPSO option.
+ * If the option is valid then %true is returned, otherwise
+ * %false is returned.
+ *
+ * The caller should have already checked that the length of the
+ * option (including the TLV header) is >= 10 and that the catmap
+ * length is consistent with the option length.
+ *
+ * We leave checks on the level and categories to the socket layer.
+ */
+bool calipso_validate(const struct sk_buff *skb, const unsigned char *option)
+{
+ struct calipso_doi *doi_def;
+ bool ret_val;
+ u16 crc, len = option[1] + 2;
+ static const u8 zero[2];
+
+ /* The original CRC runs over the option including the TLV header
+ * with the CRC-16 field (at offset 8) zeroed out. */
+ crc = crc_ccitt(0xffff, option, 8);
+ crc = crc_ccitt(crc, zero, sizeof(zero));
+ if (len > 10)
+ crc = crc_ccitt(crc, option + 10, len - 10);
+ crc = ~crc;
+ if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff))
+ return false;
+
+ rcu_read_lock();
+ doi_def = calipso_doi_search(get_unaligned_be32(option + 2));
+ ret_val = !!doi_def;
+ rcu_read_unlock();
+
+ return ret_val;
+}
+
+/**
+ * calipso_map_cat_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category bitmap in network/CALIPSO format
+ * @net_cat_len: the length of the CALIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CALIPSO bitmap using the given DOI definition. Returns the minimum
+ * size in bytes of the network bitmap on success, negative values otherwise.
+ *
+ */
+static int calipso_map_cat_hton(const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int spot = -1;
+ u32 net_spot_max = 0;
+ u32 net_clen_bits = net_cat_len * 8;
+
+ for (;;) {
+ spot = netlbl_catmap_walk(secattr->attr.mls.cat,
+ spot + 1);
+ if (spot < 0)
+ break;
+ if (spot >= net_clen_bits)
+ return -ENOSPC;
+ netlbl_bitmap_setbit(net_cat, spot, 1);
+
+ if (spot > net_spot_max)
+ net_spot_max = spot;
+ }
+
+ return (net_spot_max / 32 + 1) * 4;
+}
+
+/**
+ * calipso_map_cat_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CALIPSO format
+ * @net_cat_len: the length of the CALIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CALIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ int spot = -1;
+ u32 net_clen_bits = net_cat_len * 8;
+
+ for (;;) {
+ spot = netlbl_bitmap_walk(net_cat,
+ net_clen_bits,
+ spot + 1,
+ 1);
+ if (spot < 0) {
+ if (spot == -2)
+ return -EFAULT;
+ return 0;
+ }
+
+ ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+ spot,
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * calipso_pad_write - Writes pad bytes in TLV format
+ * @buf: the buffer
+ * @offset: offset from start of buffer to write padding
+ * @count: number of pad bytes to write
+ *
+ * Description:
+ * Write @count bytes of TLV padding into @buffer starting at offset @offset.
+ * @count should be less than 8 - see RFC 4942.
+ *
+ */
+static int calipso_pad_write(unsigned char *buf, unsigned int offset,
+ unsigned int count)
+{
+ if (WARN_ON_ONCE(count >= 8))
+ return -EINVAL;
+
+ switch (count) {
+ case 0:
+ break;
+ case 1:
+ buf[offset] = IPV6_TLV_PAD1;
+ break;
+ default:
+ buf[offset] = IPV6_TLV_PADN;
+ buf[offset + 1] = count - 2;
+ if (count > 2)
+ memset(buf + offset + 2, 0, count - 2);
+ break;
+ }
+ return 0;
+}
+
+/**
+ * calipso_genopt - Generate a CALIPSO option
+ * @buf: the option buffer
+ * @start: offset from which to write
+ * @buf_len: the size of opt_buf
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Generate a CALIPSO option using the DOI definition and security attributes
+ * passed to the function. This also generates upto three bytes of leading
+ * padding that ensures that the option is 4n + 2 aligned. It returns the
+ * number of bytes written (including any initial padding).
+ */
+static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 len, pad;
+ u16 crc;
+ static const unsigned char padding[4] = {2, 1, 0, 3};
+ unsigned char *calipso;
+
+ /* CALIPSO has 4n + 2 alignment */
+ pad = padding[start & 3];
+ if (buf_len <= start + pad + CALIPSO_HDR_LEN)
+ return -ENOSPC;
+
+ if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+ return -EPERM;
+
+ len = CALIPSO_HDR_LEN;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = calipso_map_cat_hton(doi_def,
+ secattr,
+ buf + start + pad + len,
+ buf_len - start - pad - len);
+ if (ret_val < 0)
+ return ret_val;
+ len += ret_val;
+ }
+
+ calipso_pad_write(buf, start, pad);
+ calipso = buf + start + pad;
+
+ calipso[0] = IPV6_TLV_CALIPSO;
+ calipso[1] = len - 2;
+ *(__be32 *)(calipso + 2) = htonl(doi_def->doi);
+ calipso[6] = (len - CALIPSO_HDR_LEN) / 4;
+ calipso[7] = secattr->attr.mls.lvl,
+ crc = ~crc_ccitt(0xffff, calipso, len);
+ calipso[8] = crc & 0xff;
+ calipso[9] = (crc >> 8) & 0xff;
+ return pad + len;
+}
+
+/* Hop-by-hop hdr helper functions
+ */
+
+/**
+ * calipso_opt_update - Replaces socket's hop options with a new set
+ * @sk: the socket
+ * @hop: new hop options
+ *
+ * Description:
+ * Replaces @sk's hop options with @hop. @hop may be NULL to leave
+ * the socket with no hop options.
+ *
+ */
+static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
+{
+ struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
+
+ txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS,
+ hop, hop ? ipv6_optlen(hop) : 0);
+ txopt_put(old);
+ if (IS_ERR(txopts))
+ return PTR_ERR(txopts);
+
+ txopts = ipv6_update_options(sk, txopts);
+ if (txopts) {
+ atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+ txopt_put(txopts);
+ }
+
+ return 0;
+}
+
+/**
+ * calipso_tlv_len - Returns the length of the TLV
+ * @opt: the option header
+ * @offset: offset of the TLV within the header
+ *
+ * Description:
+ * Returns the length of the TLV option at offset @offset within
+ * the option header @opt. Checks that the entire TLV fits inside
+ * the option header, returns a negative value if this is not the case.
+ */
+static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset)
+{
+ unsigned char *tlv = (unsigned char *)opt;
+ unsigned int opt_len = ipv6_optlen(opt), tlv_len;
+
+ if (offset < sizeof(*opt) || offset >= opt_len)
+ return -EINVAL;
+ if (tlv[offset] == IPV6_TLV_PAD1)
+ return 1;
+ if (offset + 1 >= opt_len)
+ return -EINVAL;
+ tlv_len = tlv[offset + 1] + 2;
+ if (offset + tlv_len > opt_len)
+ return -EINVAL;
+ return tlv_len;
+}
+
+/**
+ * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header
+ * @hop: the hop options header
+ * @start: on return holds the offset of any leading padding
+ * @end: on return holds the offset of the first non-pad TLV after CALIPSO
+ *
+ * Description:
+ * Finds the space occupied by a CALIPSO option (including any leading and
+ * trailing padding).
+ *
+ * If a CALIPSO option exists set @start and @end to the
+ * offsets within @hop of the start of padding before the first
+ * CALIPSO option and the end of padding after the first CALIPSO
+ * option. In this case the function returns 0.
+ *
+ * In the absence of a CALIPSO option, @start and @end will be
+ * set to the start and end of any trailing padding in the header.
+ * This is useful when appending a new option, as the caller may want
+ * to overwrite some of this padding. In this case the function will
+ * return -ENOENT.
+ */
+static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start,
+ unsigned int *end)
+{
+ int ret_val = -ENOENT, tlv_len;
+ unsigned int opt_len, offset, offset_s = 0, offset_e = 0;
+ unsigned char *opt = (unsigned char *)hop;
+
+ opt_len = ipv6_optlen(hop);
+ offset = sizeof(*hop);
+
+ while (offset < opt_len) {
+ tlv_len = calipso_tlv_len(hop, offset);
+ if (tlv_len < 0)
+ return tlv_len;
+
+ switch (opt[offset]) {
+ case IPV6_TLV_PAD1:
+ case IPV6_TLV_PADN:
+ if (offset_e)
+ offset_e = offset;
+ break;
+ case IPV6_TLV_CALIPSO:
+ ret_val = 0;
+ offset_e = offset;
+ break;
+ default:
+ if (offset_e == 0)
+ offset_s = offset;
+ else
+ goto out;
+ }
+ offset += tlv_len;
+ }
+
+out:
+ if (offset_s)
+ *start = offset_s + calipso_tlv_len(hop, offset_s);
+ else
+ *start = sizeof(*hop);
+ if (offset_e)
+ *end = offset_e + calipso_tlv_len(hop, offset_e);
+ else
+ *end = opt_len;
+
+ return ret_val;
+}
+
+/**
+ * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr
+ * @hop: the original hop options header
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Creates a new hop options header based on @hop with a
+ * CALIPSO option added to it. If @hop already contains a CALIPSO
+ * option this is overwritten, otherwise the new option is appended
+ * after any existing options. If @hop is NULL then the new header
+ * will contain just the CALIPSO option and any needed padding.
+ *
+ */
+static struct ipv6_opt_hdr *
+calipso_opt_insert(struct ipv6_opt_hdr *hop,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ unsigned int start, end, buf_len, pad, hop_len;
+ struct ipv6_opt_hdr *new;
+ int ret_val;
+
+ if (hop) {
+ hop_len = ipv6_optlen(hop);
+ ret_val = calipso_opt_find(hop, &start, &end);
+ if (ret_val && ret_val != -ENOENT)
+ return ERR_PTR(ret_val);
+ } else {
+ hop_len = 0;
+ start = sizeof(*hop);
+ end = 0;
+ }
+
+ buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD;
+ new = kzalloc(buf_len, GFP_ATOMIC);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ if (start > sizeof(*hop))
+ memcpy(new, hop, start);
+ ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def,
+ secattr);
+ if (ret_val < 0) {
+ kfree(new);
+ return ERR_PTR(ret_val);
+ }
+
+ buf_len = start + ret_val;
+ /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */
+ pad = ((buf_len & 4) + (end & 7)) & 7;
+ calipso_pad_write((unsigned char *)new, buf_len, pad);
+ buf_len += pad;
+
+ if (end != hop_len) {
+ memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end);
+ buf_len += hop_len - end;
+ }
+ new->nexthdr = 0;
+ new->hdrlen = buf_len / 8 - 1;
+
+ return new;
+}
+
+/**
+ * calipso_opt_del - Removes the CALIPSO option from an option header
+ * @hop: the original header
+ * @new: the new header
+ *
+ * Description:
+ * Creates a new header based on @hop without any CALIPSO option. If @hop
+ * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains
+ * no other non-padding options, it returns zero with @new set to NULL.
+ * Otherwise it returns zero, creates a new header without the CALIPSO
+ * option (and removing as much padding as possible) and returns with
+ * @new set to that header.
+ *
+ */
+static int calipso_opt_del(struct ipv6_opt_hdr *hop,
+ struct ipv6_opt_hdr **new)
+{
+ int ret_val;
+ unsigned int start, end, delta, pad, hop_len;
+
+ ret_val = calipso_opt_find(hop, &start, &end);
+ if (ret_val)
+ return ret_val;
+
+ hop_len = ipv6_optlen(hop);
+ if (start == sizeof(*hop) && end == hop_len) {
+ /* There's no other option in the header so return NULL */
+ *new = NULL;
+ return 0;
+ }
+
+ delta = (end - start) & ~7;
+ *new = kzalloc(hop_len - delta, GFP_ATOMIC);
+ if (!*new)
+ return -ENOMEM;
+
+ memcpy(*new, hop, start);
+ (*new)->hdrlen -= delta / 8;
+ pad = (end - start) & 7;
+ calipso_pad_write((unsigned char *)*new, start, pad);
+ if (end != hop_len)
+ memcpy((char *)*new + start + pad, (char *)hop + end,
+ hop_len - end);
+
+ return 0;
+}
+
+/**
+ * calipso_opt_getattr - Get the security attributes from a memory block
+ * @calipso: the CALIPSO option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @calipso and return the security attributes in @secattr.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_opt_getattr(const unsigned char *calipso,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ u32 doi, len = calipso[1], cat_len = calipso[6] * 4;
+ struct calipso_doi *doi_def;
+
+ if (cat_len + 8 > len)
+ return -EINVAL;
+
+ if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0)
+ return 0;
+
+ doi = get_unaligned_be32(calipso + 2);
+ rcu_read_lock();
+ doi_def = calipso_doi_search(doi);
+ if (!doi_def)
+ goto getattr_return;
+
+ secattr->attr.mls.lvl = calipso[7];
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (cat_len) {
+ ret_val = calipso_map_cat_ntoh(doi_def,
+ calipso + 10,
+ cat_len,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ goto getattr_return;
+ }
+
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ secattr->type = NETLBL_NLTYPE_CALIPSO;
+
+getattr_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/* sock functions.
+ */
+
+/**
+ * calipso_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CALIPSO option attached to the sock and if
+ * there is return the CALIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_sock_getattr(struct sock *sk,
+ struct netlbl_lsm_secattr *secattr)
+{
+ struct ipv6_opt_hdr *hop;
+ int opt_len, len, ret_val = -ENOMSG, offset;
+ unsigned char *opt;
+ struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+
+ if (!txopts || !txopts->hopopt)
+ goto done;
+
+ hop = txopts->hopopt;
+ opt = (unsigned char *)hop;
+ opt_len = ipv6_optlen(hop);
+ offset = sizeof(*hop);
+ while (offset < opt_len) {
+ len = calipso_tlv_len(hop, offset);
+ if (len < 0) {
+ ret_val = len;
+ goto done;
+ }
+ switch (opt[offset]) {
+ case IPV6_TLV_CALIPSO:
+ if (len < CALIPSO_HDR_LEN)
+ ret_val = -EINVAL;
+ else
+ ret_val = calipso_opt_getattr(&opt[offset],
+ secattr);
+ goto done;
+ default:
+ offset += len;
+ break;
+ }
+ }
+done:
+ txopt_put(txopts);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_setattr - Add a CALIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+static int calipso_sock_setattr(struct sock *sk,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct ipv6_opt_hdr *old, *new;
+ struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+
+ old = NULL;
+ if (txopts)
+ old = txopts->hopopt;
+
+ new = calipso_opt_insert(old, doi_def, secattr);
+ txopt_put(txopts);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ ret_val = calipso_opt_update(sk, new);
+
+ kfree(new);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_delattr - Delete the CALIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a socket, if present.
+ *
+ */
+static void calipso_sock_delattr(struct sock *sk)
+{
+ struct ipv6_opt_hdr *new_hop;
+ struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+
+ if (!txopts || !txopts->hopopt)
+ goto done;
+
+ if (calipso_opt_del(txopts->hopopt, &new_hop))
+ goto done;
+
+ calipso_opt_update(sk, new_hop);
+ kfree(new_hop);
+
+done:
+ txopt_put(txopts);
+}
+
+/* request sock functions.
+ */
+
+/**
+ * calipso_req_setattr - Add a CALIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+static int calipso_req_setattr(struct request_sock *req,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ struct ipv6_txoptions *txopts;
+ struct inet_request_sock *req_inet = inet_rsk(req);
+ struct ipv6_opt_hdr *old, *new;
+ struct sock *sk = sk_to_full_sk(req_to_sk(req));
+
+ if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt)
+ old = req_inet->ipv6_opt->hopopt;
+ else
+ old = NULL;
+
+ new = calipso_opt_insert(old, doi_def, secattr);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
+ new, new ? ipv6_optlen(new) : 0);
+
+ kfree(new);
+
+ if (IS_ERR(txopts))
+ return PTR_ERR(txopts);
+
+ txopts = xchg(&req_inet->ipv6_opt, txopts);
+ if (txopts) {
+ atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+ txopt_put(txopts);
+ }
+
+ return 0;
+}
+
+/**
+ * calipso_req_delattr - Delete the CALIPSO option from a request socket
+ * @reg: the request socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a request socket, if present.
+ *
+ */
+static void calipso_req_delattr(struct request_sock *req)
+{
+ struct inet_request_sock *req_inet = inet_rsk(req);
+ struct ipv6_opt_hdr *new;
+ struct ipv6_txoptions *txopts;
+ struct sock *sk = sk_to_full_sk(req_to_sk(req));
+
+ if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt)
+ return;
+
+ if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
+ return; /* Nothing to do */
+
+ txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
+ new, new ? ipv6_optlen(new) : 0);
+
+ if (!IS_ERR(txopts)) {
+ txopts = xchg(&req_inet->ipv6_opt, txopts);
+ if (txopts) {
+ atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+ txopt_put(txopts);
+ }
+ }
+ kfree(new);
+}
+
+/* skbuff functions.
+ */
+
+/**
+ * calipso_skbuff_optptr - Find the CALIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer
+ * to the start of the CALIPSO option on success, NULL if one if not found.
+ *
+ */
+static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb);
+ int offset;
+
+ if (ip6_hdr->nexthdr != NEXTHDR_HOP)
+ return NULL;
+
+ offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO);
+ if (offset >= 0)
+ return (unsigned char *)ip6_hdr + offset;
+
+ return NULL;
+}
+
+/**
+ * calipso_skbuff_setattr - Set the CALIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CALIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+static int calipso_skbuff_setattr(struct sk_buff *skb,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct ipv6hdr *ip6_hdr;
+ struct ipv6_opt_hdr *hop;
+ unsigned char buf[CALIPSO_MAX_BUFFER];
+ int len_delta, new_end, pad;
+ unsigned int start, end;
+
+ ip6_hdr = ipv6_hdr(skb);
+ if (ip6_hdr->nexthdr == NEXTHDR_HOP) {
+ hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+ ret_val = calipso_opt_find(hop, &start, &end);
+ if (ret_val && ret_val != -ENOENT)
+ return ret_val;
+ } else {
+ start = 0;
+ end = 0;
+ }
+
+ memset(buf, 0, sizeof(buf));
+ ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr);
+ if (ret_val < 0)
+ return ret_val;
+
+ new_end = start + ret_val;
+ /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */
+ pad = ((new_end & 4) + (end & 7)) & 7;
+ len_delta = new_end - (int)end + pad;
+ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ if (ret_val < 0)
+ return ret_val;
+
+ if (len_delta) {
+ if (len_delta > 0)
+ skb_push(skb, len_delta);
+ else
+ skb_pull(skb, -len_delta);
+ memmove((char *)ip6_hdr - len_delta, ip6_hdr,
+ sizeof(*ip6_hdr) + start);
+ skb_reset_network_header(skb);
+ ip6_hdr = ipv6_hdr(skb);
+ }
+
+ hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+ if (start == 0) {
+ struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf;
+
+ new_hop->nexthdr = ip6_hdr->nexthdr;
+ new_hop->hdrlen = len_delta / 8 - 1;
+ ip6_hdr->nexthdr = NEXTHDR_HOP;
+ } else {
+ hop->hdrlen += len_delta / 8;
+ }
+ memcpy((char *)hop + start, buf + (start & 3), new_end - start);
+ calipso_pad_write((unsigned char *)hop, new_end, pad);
+
+ return 0;
+}
+
+/**
+ * calipso_skbuff_delattr - Delete any CALIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CALIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int calipso_skbuff_delattr(struct sk_buff *skb)
+{
+ int ret_val;
+ struct ipv6hdr *ip6_hdr;
+ struct ipv6_opt_hdr *old_hop;
+ u32 old_hop_len, start = 0, end = 0, delta, size, pad;
+
+ if (!calipso_skbuff_optptr(skb))
+ return 0;
+
+ /* since we are changing the packet we should make a copy */
+ ret_val = skb_cow(skb, skb_headroom(skb));
+ if (ret_val < 0)
+ return ret_val;
+
+ ip6_hdr = ipv6_hdr(skb);
+ old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+ old_hop_len = ipv6_optlen(old_hop);
+
+ ret_val = calipso_opt_find(old_hop, &start, &end);
+ if (ret_val)
+ return ret_val;
+
+ if (start == sizeof(*old_hop) && end == old_hop_len) {
+ /* There's no other option in the header so we delete
+ * the whole thing. */
+ delta = old_hop_len;
+ size = sizeof(*ip6_hdr);
+ ip6_hdr->nexthdr = old_hop->nexthdr;
+ } else {
+ delta = (end - start) & ~7;
+ if (delta)
+ old_hop->hdrlen -= delta / 8;
+ pad = (end - start) & 7;
+ size = sizeof(*ip6_hdr) + start + pad;
+ calipso_pad_write((unsigned char *)old_hop, start, pad);
+ }
+
+ if (delta) {
+ skb_pull(skb, delta);
+ memmove((char *)ip6_hdr + delta, ip6_hdr, size);
+ skb_reset_network_header(skb);
+ }
+
+ return 0;
+}
+
+static const struct netlbl_calipso_ops ops = {
+ .doi_add = calipso_doi_add,
+ .doi_free = calipso_doi_free,
+ .doi_remove = calipso_doi_remove,
+ .doi_getdef = calipso_doi_getdef,
+ .doi_putdef = calipso_doi_putdef,
+ .doi_walk = calipso_doi_walk,
+ .sock_getattr = calipso_sock_getattr,
+ .sock_setattr = calipso_sock_setattr,
+ .sock_delattr = calipso_sock_delattr,
+ .req_setattr = calipso_req_setattr,
+ .req_delattr = calipso_req_delattr,
+ .opt_getattr = calipso_opt_getattr,
+ .skbuff_optptr = calipso_skbuff_optptr,
+ .skbuff_setattr = calipso_skbuff_setattr,
+ .skbuff_delattr = calipso_skbuff_delattr,
+ .cache_invalidate = calipso_cache_invalidate,
+ .cache_add = calipso_cache_add
+};
+
+/**
+ * calipso_init - Initialize the CALIPSO module
+ *
+ * Description:
+ * Initialize the CALIPSO module and prepare it for use. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int __init calipso_init(void)
+{
+ int ret_val;
+
+ ret_val = calipso_cache_init();
+ if (!ret_val)
+ netlbl_calipso_ops_register(&ops);
+ return ret_val;
+}
+
+void calipso_exit(void)
+{
+ netlbl_calipso_ops_register(NULL);
+ calipso_cache_invalidate();
+ kfree(calipso_cache);
+}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 8de5dd7aa..139ceb68b 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -43,6 +43,7 @@
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
+#include <net/calipso.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/xfrm.h>
#endif
@@ -603,6 +604,28 @@ drop:
return false;
}
+/* CALIPSO RFC 5570 */
+
+static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff)
+{
+ const unsigned char *nh = skb_network_header(skb);
+
+ if (nh[optoff + 1] < 8)
+ goto drop;
+
+ if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1])
+ goto drop;
+
+ if (!calipso_validate(skb, nh + optoff))
+ goto drop;
+
+ return true;
+
+drop:
+ kfree_skb(skb);
+ return false;
+}
+
static const struct tlvtype_proc tlvprochopopt_lst[] = {
{
.type = IPV6_TLV_ROUTERALERT,
@@ -612,6 +635,10 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = {
.type = IPV6_TLV_JUMBO,
.func = ipv6_hop_jumbo,
},
+ {
+ .type = IPV6_TLV_CALIPSO,
+ .func = ipv6_hop_calipso,
+ },
{ -1, }
};
@@ -758,6 +785,27 @@ static int ipv6_renew_option(void *ohdr,
return 0;
}
+/**
+ * ipv6_renew_options - replace a specific ext hdr with a new one.
+ *
+ * @sk: sock from which to allocate memory
+ * @opt: original options
+ * @newtype: option type to replace in @opt
+ * @newopt: new option of type @newtype to replace (user-mem)
+ * @newoptlen: length of @newopt
+ *
+ * Returns a new set of options which is a copy of @opt with the
+ * option type @newtype replaced with @newopt.
+ *
+ * @opt may be NULL, in which case a new set of options is returned
+ * containing just @newopt.
+ *
+ * @newopt may be NULL, in which case the specified option type is
+ * not copied into the new set of options.
+ *
+ * The new set of options is allocated from the socket option memory
+ * buffer of @sk.
+ */
struct ipv6_txoptions *
ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
int newtype,
@@ -830,6 +878,34 @@ out:
return ERR_PTR(err);
}
+/**
+ * ipv6_renew_options_kern - replace a specific ext hdr with a new one.
+ *
+ * @sk: sock from which to allocate memory
+ * @opt: original options
+ * @newtype: option type to replace in @opt
+ * @newopt: new option of type @newtype to replace (kernel-mem)
+ * @newoptlen: length of @newopt
+ *
+ * See ipv6_renew_options(). The difference is that @newopt is
+ * kernel memory, rather than user memory.
+ */
+struct ipv6_txoptions *
+ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt,
+ int newtype, struct ipv6_opt_hdr *newopt,
+ int newoptlen)
+{
+ struct ipv6_txoptions *ret_val;
+ const mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret_val = ipv6_renew_options(sk, opt, newtype,
+ (struct ipv6_opt_hdr __user *)newopt,
+ newoptlen);
+ set_fs(old_fs);
+ return ret_val;
+}
+
struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
struct ipv6_txoptions *opt)
{
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 9508a20fb..305e2ed73 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -112,7 +112,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
}
EXPORT_SYMBOL(ipv6_skip_exthdr);
-int ipv6_find_tlv(struct sk_buff *skb, int offset, int type)
+int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type)
{
const unsigned char *nh = skb_network_header(skb);
int packet_len = skb_tail_pointer(skb) - skb_network_header(skb);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index ed33abf57..5857c1fc8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -67,6 +67,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
struct net *net = rule->fr_net;
pol_lookup_t lookup = arg->lookup_ptr;
int err = 0;
+ u32 tb_id;
switch (rule->action) {
case FR_ACT_TO_TBL:
@@ -86,7 +87,8 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
goto discard_pkt;
}
- table = fib6_get_table(net, rule->table);
+ tb_id = fib_rule_get_table(rule, arg);
+ table = fib6_get_table(net, tb_id);
if (!table) {
err = -EAGAIN;
goto out;
@@ -199,7 +201,7 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
struct fib6_rule *rule6 = (struct fib6_rule *) rule;
- if (rule->action == FR_ACT_TO_TBL) {
+ if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
if (rule->table == RT6_TABLE_UNSPEC)
goto errout;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index a4fa84076..bd59c343d 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -388,7 +388,8 @@ relookup_failed:
/*
* Send an ICMP message in response to a packet in error
*/
-static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+ const struct in6_addr *force_saddr)
{
struct net *net = dev_net(skb->dev);
struct inet6_dev *idev = NULL;
@@ -475,6 +476,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.daddr = hdr->saddr;
+ if (force_saddr)
+ saddr = force_saddr;
if (saddr)
fl6.saddr = *saddr;
fl6.flowi6_mark = mark;
@@ -502,12 +505,14 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
+ ipc6.tclass = np->tclass;
+ fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+
dst = icmpv6_route_lookup(net, skb, sk, &fl6);
if (IS_ERR(dst))
goto out;
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- ipc6.tclass = np->tclass;
ipc6.dontfrag = np->dontfrag;
ipc6.opt = NULL;
@@ -549,10 +554,75 @@ out:
*/
void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
{
- icmp6_send(skb, ICMPV6_PARAMPROB, code, pos);
+ icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL);
kfree_skb(skb);
}
+/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
+ * if sufficient data bytes are available
+ * @nhs is the size of the tunnel header(s) :
+ * Either an IPv4 header for SIT encap
+ * an IPv4 header + GRE header for GRE encap
+ */
+int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
+ unsigned int data_len)
+{
+ struct in6_addr temp_saddr;
+ struct rt6_info *rt;
+ struct sk_buff *skb2;
+ u32 info = 0;
+
+ if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8))
+ return 1;
+
+ /* RFC 4884 (partial) support for ICMP extensions */
+ if (data_len < 128 || (data_len & 7) || skb->len < data_len)
+ data_len = 0;
+
+ skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC);
+
+ if (!skb2)
+ return 1;
+
+ skb_dst_drop(skb2);
+ skb_pull(skb2, nhs);
+ skb_reset_network_header(skb2);
+
+ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0);
+
+ if (rt && rt->dst.dev)
+ skb2->dev = rt->dst.dev;
+
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr);
+
+ if (data_len) {
+ /* RFC 4884 (partial) support :
+ * insert 0 padding at the end, before the extensions
+ */
+ __skb_push(skb2, nhs);
+ skb_reset_network_header(skb2);
+ memmove(skb2->data, skb2->data + nhs, data_len - nhs);
+ memset(skb2->data + data_len - nhs, 0, nhs);
+ /* RFC 4884 4.5 : Length is measured in 64-bit words,
+ * and stored in reserved[0]
+ */
+ info = (data_len/8) << 24;
+ }
+ if (type == ICMP_TIME_EXCEEDED)
+ icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
+ info, &temp_saddr);
+ else
+ icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
+ info, &temp_saddr);
+ if (rt)
+ ip6_rt_put(rt);
+
+ kfree_skb(skb2);
+
+ return 0;
+}
+EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach);
+
static void icmpv6_echo_reply(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
@@ -585,7 +655,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.daddr = ipv6_hdr(skb)->saddr;
if (saddr)
fl6.saddr = *saddr;
- fl6.flowi6_oif = l3mdev_fib_oif(skb->dev);
+ fl6.flowi6_oif = skb->dev->ifindex;
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index d08fd2d48..e0170f62b 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -109,7 +109,8 @@ static inline bool ila_csum_neutral_set(struct ila_identifier ident)
return !!(ident.csum_neutral);
}
-void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p);
+void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
+ bool set_csum_neutral);
void ila_init_saved_csum(struct ila_params *p);
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 0e94042d1..ec9efbcda 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -34,12 +34,12 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr,
if (p->locator_match.v64) {
diff = p->csum_diff;
} else {
- diff = compute_csum_diff8((__be32 *)iaddr,
- (__be32 *)&p->locator);
+ diff = compute_csum_diff8((__be32 *)&p->locator,
+ (__be32 *)iaddr);
}
fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ?
- ~CSUM_NEUTRAL_FLAG : CSUM_NEUTRAL_FLAG);
+ CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG);
diff = csum_add(diff, fval);
@@ -103,7 +103,8 @@ static void ila_csum_adjust_transport(struct sk_buff *skb,
iaddr->loc = p->locator;
}
-void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
+void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
+ bool set_csum_neutral)
{
struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct ila_addr *iaddr = ila_a2i(&ip6h->daddr);
@@ -114,7 +115,8 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
* is a locator being translated to a SIR address.
* Perform (receiver) checksum-neutral translation.
*/
- ila_csum_do_neutral(iaddr, p);
+ if (!set_csum_neutral)
+ ila_csum_do_neutral(iaddr, p);
} else {
switch (p->csum_mode) {
case ILA_CSUM_ADJUST_TRANSPORT:
@@ -138,8 +140,8 @@ void ila_init_saved_csum(struct ila_params *p)
return;
p->csum_diff = compute_csum_diff8(
- (__be32 *)&p->locator_match,
- (__be32 *)&p->locator);
+ (__be32 *)&p->locator,
+ (__be32 *)&p->locator_match);
}
static int __init ila_init(void)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 1dfb64166..c8314c6b6 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -26,7 +26,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (skb->protocol != htons(ETH_P_IPV6))
goto drop;
- ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
+ ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), true);
return dst->lwtstate->orig_output(net, sk, skb);
@@ -42,7 +42,7 @@ static int ila_input(struct sk_buff *skb)
if (skb->protocol != htons(ETH_P_IPV6))
goto drop;
- ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
+ ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false);
return dst->lwtstate->orig_input(skb);
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index a90e57229..e6eca5fdf 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -210,14 +210,14 @@ static void ila_free_cb(void *ptr, void *arg)
}
}
-static int ila_xlat_addr(struct sk_buff *skb);
+static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral);
static unsigned int
ila_nf_input(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- ila_xlat_addr(skb);
+ ila_xlat_addr(skb, false);
return NF_ACCEPT;
}
@@ -597,7 +597,7 @@ static struct pernet_operations ila_net_ops = {
.size = sizeof(struct ila_net),
};
-static int ila_xlat_addr(struct sk_buff *skb)
+static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral)
{
struct ila_map *ila;
struct ipv6hdr *ip6h = ipv6_hdr(skb);
@@ -616,7 +616,7 @@ static int ila_xlat_addr(struct sk_buff *skb)
ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan);
if (ila)
- ila_update_ipv6_locator(skb, &ila->xp.ip);
+ ila_update_ipv6_locator(skb, &ila->xp.ip, set_csum_neutral);
rcu_read_unlock();
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 776d14511..edc3daab3 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -519,8 +519,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno));
- skb_set_inner_protocol(skb, protocol);
-
return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
NEXTHDR_GRE);
}
@@ -650,7 +648,6 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = skb->protocol;
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
index 14dacc544..713676f14 100644
--- a/net/ipv6/ip6_icmp.c
+++ b/net/ipv6/ip6_icmp.c
@@ -39,7 +39,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
if (!send)
goto out;
- send(skb, type, code, info);
+ send(skb, type, code, info, NULL);
out:
rcu_read_unlock();
}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 94611e450..aacfb4bce 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -323,6 +323,7 @@ int ip6_input(struct sk_buff *skb)
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
ip6_input_finish);
}
+EXPORT_SYMBOL_GPL(ip6_input);
int ip6_mc_input(struct sk_buff *skb)
{
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 635b8d340..1dfc402d9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -368,7 +368,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
if (skb->ignore_df)
return false;
- if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
return false;
return true;
@@ -910,6 +910,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
int err;
int flags = 0;
+ if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
+ (!*dst || !(*dst)->error)) {
+ err = l3mdev_get_saddr6(net, sk, fl6);
+ if (err)
+ goto out_err;
+ }
+
/* The correct way to handle this would be to do
* ip6_route_get_saddr, and then ip6_route_output; however,
* the route-specific preferred source forces the
@@ -999,10 +1006,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
return 0;
out_err_release:
- if (err == -ENETUNREACH)
- IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
dst_release(*dst);
*dst = NULL;
+out_err:
+ if (err == -ENETUNREACH)
+ IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
return err;
}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index d90a11f14..5bd3afdcc 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb)
goto discard;
}
- XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
-
rcu_read_unlock();
- return xfrm6_rcv(skb);
+ return xfrm6_rcv_tnl(skb, t);
}
rcu_read_unlock();
return -EINVAL;
@@ -340,6 +338,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
struct net_device *dev;
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
+ struct xfrm_mode *inner_mode;
struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
u32 orig_mark = skb->mark;
int ret;
@@ -357,7 +356,19 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
}
x = xfrm_input_state(skb);
- family = x->inner_mode->afinfo->family;
+
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ family = inner_mode->afinfo->family;
skb->mark = be32_to_cpu(t->parms.i_key);
ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 487ef3bc7..7f4265b16 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -921,6 +921,7 @@ static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *ca
cache->mfc_un.res.maxvif = vifi + 1;
}
}
+ cache->mfc_un.res.lastuse = jiffies;
}
static int mif6_add(struct net *net, struct mr6_table *mrt,
@@ -1592,14 +1593,15 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk)
if (likely(mrt->mroute6_sk == NULL)) {
mrt->mroute6_sk = sk;
net->ipv6.devconf_all->mc_forwarding++;
- inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
- NETCONFA_IFINDEX_ALL,
- net->ipv6.devconf_all);
- }
- else
+ } else {
err = -EADDRINUSE;
+ }
write_unlock_bh(&mrt_lock);
+ if (!err)
+ inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
rtnl_unlock();
return err;
@@ -1617,11 +1619,11 @@ int ip6mr_sk_done(struct sock *sk)
write_lock_bh(&mrt_lock);
mrt->mroute6_sk = NULL;
net->ipv6.devconf_all->mc_forwarding--;
+ write_unlock_bh(&mrt_lock);
inet6_netconf_notify_devconf(net,
NETCONFA_MC_FORWARDING,
NETCONFA_IFINDEX_ALL,
net->ipv6.devconf_all);
- write_unlock_bh(&mrt_lock);
mroute_clean_tables(mrt, false);
err = 0;
@@ -2091,6 +2093,7 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
vif = cache->mf6c_parent;
cache->mfc_un.res.pkt++;
cache->mfc_un.res.bytes += skb->len;
+ cache->mfc_un.res.lastuse = jiffies;
if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) {
struct mfc6_cache *cache_proxy;
@@ -2233,10 +2236,11 @@ int ip6_mr_input(struct sk_buff *skb)
static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
struct mfc6_cache *c, struct rtmsg *rtm)
{
- int ct;
- struct rtnexthop *nhp;
- struct nlattr *mp_attr;
struct rta_mfc_stats mfcs;
+ struct nlattr *mp_attr;
+ struct rtnexthop *nhp;
+ unsigned long lastuse;
+ int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
if (c->mf6c_parent >= MAXMIFS)
@@ -2266,18 +2270,23 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
nla_nest_end(skb, mp_attr);
+ lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+ lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
- if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0)
+ if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
+ nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
+ RTA_PAD))
return -EMSGSIZE;
rtm->rtm_type = RTN_MULTICAST;
return 1;
}
-int ip6mr_get_route(struct net *net,
- struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
+ int nowait, u32 portid)
{
int err;
struct mr6_table *mrt;
@@ -2322,6 +2331,7 @@ int ip6mr_get_route(struct net *net,
return -ENOMEM;
}
+ NETLINK_CB(skb2).portid = portid;
skb_reset_transport_header(skb2);
skb_put(skb2, sizeof(struct ipv6hdr));
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a9895e15e..5330262ab 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -98,7 +98,6 @@ int ip6_ra_control(struct sock *sk, int sel)
return 0;
}
-static
struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
struct ipv6_txoptions *opt)
{
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index c245895a3..fe65cdc28 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -73,15 +73,6 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
-/* Set to 3 to get tracing... */
-#define ND_DEBUG 1
-
-#define ND_PRINTK(val, level, fmt, ...) \
-do { \
- if (val <= ND_DEBUG) \
- net_##level##_ratelimited(fmt, ##__VA_ARGS__); \
-} while (0)
-
static u32 ndisc_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd);
@@ -150,11 +141,10 @@ struct neigh_table nd_tbl = {
};
EXPORT_SYMBOL_GPL(nd_tbl);
-static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data)
+void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
+ int data_len, int pad)
{
- int pad = ndisc_addr_option_pad(skb->dev->type);
- int data_len = skb->dev->addr_len;
- int space = ndisc_opt_addr_space(skb->dev);
+ int space = __ndisc_opt_addr_space(data_len, pad);
u8 *opt = skb_put(skb, space);
opt[0] = type;
@@ -171,6 +161,23 @@ static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data)
if (space > 0)
memset(opt, 0, space);
}
+EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option);
+
+static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type,
+ void *data, u8 icmp6_type)
+{
+ __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len,
+ ndisc_addr_option_pad(skb->dev->type));
+ ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type);
+}
+
+static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb,
+ void *ha,
+ const u8 *ops_data)
+{
+ ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT);
+ ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data);
+}
static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
struct nd_opt_hdr *end)
@@ -185,24 +192,28 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
return cur <= end && cur->nd_opt_type == type ? cur : NULL;
}
-static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
+static inline int ndisc_is_useropt(const struct net_device *dev,
+ struct nd_opt_hdr *opt)
{
return opt->nd_opt_type == ND_OPT_RDNSS ||
- opt->nd_opt_type == ND_OPT_DNSSL;
+ opt->nd_opt_type == ND_OPT_DNSSL ||
+ ndisc_ops_is_useropt(dev, opt->nd_opt_type);
}
-static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
+static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev,
+ struct nd_opt_hdr *cur,
struct nd_opt_hdr *end)
{
if (!cur || !end || cur >= end)
return NULL;
do {
cur = ((void *)cur) + (cur->nd_opt_len << 3);
- } while (cur < end && !ndisc_is_useropt(cur));
- return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
+ } while (cur < end && !ndisc_is_useropt(dev, cur));
+ return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL;
}
-struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
+struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
+ u8 *opt, int opt_len,
struct ndisc_options *ndopts)
{
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
@@ -217,6 +228,8 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
l = nd_opt->nd_opt_len << 3;
if (opt_len < l || l == 0)
return NULL;
+ if (ndisc_ops_parse_options(dev, nd_opt, ndopts))
+ goto next_opt;
switch (nd_opt->nd_opt_type) {
case ND_OPT_SOURCE_LL_ADDR:
case ND_OPT_TARGET_LL_ADDR:
@@ -243,7 +256,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
break;
#endif
default:
- if (ndisc_is_useropt(nd_opt)) {
+ if (ndisc_is_useropt(dev, nd_opt)) {
ndopts->nd_useropts_end = nd_opt;
if (!ndopts->nd_useropts)
ndopts->nd_useropts = nd_opt;
@@ -260,6 +273,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
nd_opt->nd_opt_len);
}
}
+next_opt:
opt_len -= l;
nd_opt = ((void *)nd_opt) + l;
}
@@ -509,7 +523,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
if (!dev->addr_len)
inc_opt = 0;
if (inc_opt)
- optlen += ndisc_opt_addr_space(dev);
+ optlen += ndisc_opt_addr_space(dev,
+ NDISC_NEIGHBOUR_ADVERTISEMENT);
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
@@ -528,8 +543,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
if (inc_opt)
ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
- dev->dev_addr);
-
+ dev->dev_addr,
+ NDISC_NEIGHBOUR_ADVERTISEMENT);
ndisc_send_skb(skb, daddr, src_addr);
}
@@ -574,7 +589,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
if (ipv6_addr_any(saddr))
inc_opt = false;
if (inc_opt)
- optlen += ndisc_opt_addr_space(dev);
+ optlen += ndisc_opt_addr_space(dev,
+ NDISC_NEIGHBOUR_SOLICITATION);
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
@@ -590,7 +606,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
if (inc_opt)
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
- dev->dev_addr);
+ dev->dev_addr,
+ NDISC_NEIGHBOUR_SOLICITATION);
ndisc_send_skb(skb, daddr, saddr);
}
@@ -626,7 +643,7 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
}
#endif
if (send_sllao)
- optlen += ndisc_opt_addr_space(dev);
+ optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION);
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
@@ -641,7 +658,8 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
if (send_sllao)
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
- dev->dev_addr);
+ dev->dev_addr,
+ NDISC_ROUTER_SOLICITATION);
ndisc_send_skb(skb, daddr, saddr);
}
@@ -702,6 +720,15 @@ static int pndisc_is_router(const void *pkey,
return ret;
}
+void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
+ const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
+ struct ndisc_options *ndopts)
+{
+ neigh_update(neigh, lladdr, new, flags);
+ /* report ndisc ops about neighbour update */
+ ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts);
+}
+
static void ndisc_recv_ns(struct sk_buff *skb)
{
struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
@@ -738,7 +765,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
return;
}
- if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+ if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
ND_PRINTK(2, warn, "NS: invalid ND options\n");
return;
}
@@ -856,9 +883,10 @@ have_ifp:
neigh = __neigh_lookup(&nd_tbl, saddr, dev,
!inc || lladdr || !dev->addr_len);
if (neigh)
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
- NEIGH_UPDATE_F_OVERRIDE);
+ NEIGH_UPDATE_F_OVERRIDE,
+ NDISC_NEIGHBOUR_SOLICITATION, &ndopts);
if (neigh || !dev->header_ops) {
ndisc_send_na(dev, saddr, &msg->target, !!is_router,
true, (ifp != NULL && inc), inc);
@@ -911,7 +939,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
idev->cnf.drop_unsolicited_na)
return;
- if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+ if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
ND_PRINTK(2, warn, "NS: invalid ND option\n");
return;
}
@@ -967,12 +995,13 @@ static void ndisc_recv_na(struct sk_buff *skb)
goto out;
}
- neigh_update(neigh, lladdr,
+ ndisc_update(dev, neigh, lladdr,
msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
(msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
- (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0));
+ (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0),
+ NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts);
if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
/*
@@ -1017,7 +1046,7 @@ static void ndisc_recv_rs(struct sk_buff *skb)
goto out;
/* Parse ND options */
- if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) {
+ if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) {
ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n");
goto out;
}
@@ -1031,10 +1060,11 @@ static void ndisc_recv_rs(struct sk_buff *skb)
neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1);
if (neigh) {
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
- NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER,
+ NDISC_ROUTER_SOLICITATION, &ndopts);
neigh_release(neigh);
}
out:
@@ -1135,7 +1165,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
- if (!ndisc_parse_options(opt, optlen, &ndopts)) {
+ if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) {
ND_PRINTK(2, warn, "RA: invalid ND options\n");
return;
}
@@ -1329,11 +1359,12 @@ skip_linkparms:
goto out;
}
}
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
- NEIGH_UPDATE_F_ISROUTER);
+ NEIGH_UPDATE_F_ISROUTER,
+ NDISC_ROUTER_ADVERTISEMENT, &ndopts);
}
if (!ipv6_accept_ra(in6_dev)) {
@@ -1421,7 +1452,8 @@ skip_routeinfo:
struct nd_opt_hdr *p;
for (p = ndopts.nd_useropts;
p;
- p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) {
+ p = ndisc_next_useropt(skb->dev, p,
+ ndopts.nd_useropts_end)) {
ndisc_ra_useropt(skb, p);
}
}
@@ -1459,7 +1491,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
return;
}
- if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts))
+ if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts))
return;
if (!ndopts.nd_opts_rh) {
@@ -1504,7 +1536,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
struct dst_entry *dst;
struct flowi6 fl6;
int rd_len;
- u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+ u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
+ ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
int oif = l3mdev_fib_oif(dev);
bool ret;
@@ -1563,7 +1596,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
memcpy(ha_buf, neigh->ha, dev->addr_len);
read_unlock_bh(&neigh->lock);
ha = ha_buf;
- optlen += ndisc_opt_addr_space(dev);
+ optlen += ndisc_redirect_opt_addr_space(dev, neigh,
+ ops_data_buf,
+ &ops_data);
} else
read_unlock_bh(&neigh->lock);
@@ -1594,7 +1629,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
*/
if (ha)
- ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha);
+ ndisc_fill_redirect_addr_option(buff, ha, ops_data);
/*
* build redirect option and copy skb over to the new packet.
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 63e06c3dd..552fac2f3 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -73,22 +73,22 @@ ip6_packet_match(const struct sk_buff *skb,
unsigned long ret;
const struct ipv6hdr *ipv6 = ipv6_hdr(skb);
-#define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg)))
-
- if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk,
- &ip6info->src), IP6T_INV_SRCIP) ||
- FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk,
- &ip6info->dst), IP6T_INV_DSTIP))
+ if (NF_INVF(ip6info, IP6T_INV_SRCIP,
+ ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk,
+ &ip6info->src)) ||
+ NF_INVF(ip6info, IP6T_INV_DSTIP,
+ ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk,
+ &ip6info->dst)))
return false;
ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask);
- if (FWINV(ret != 0, IP6T_INV_VIA_IN))
+ if (NF_INVF(ip6info, IP6T_INV_VIA_IN, ret != 0))
return false;
ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask);
- if (FWINV(ret != 0, IP6T_INV_VIA_OUT))
+ if (NF_INVF(ip6info, IP6T_INV_VIA_OUT, ret != 0))
return false;
/* ... might want to do something with class and flowlabel here ... */
@@ -402,23 +402,12 @@ ip6t_do_table(struct sk_buff *skb,
else return verdict;
}
-static bool find_jump_target(const struct xt_table_info *t,
- const struct ip6t_entry *target)
-{
- struct ip6t_entry *iter;
-
- xt_entry_foreach(iter, t->entries, t->size) {
- if (iter == target)
- return true;
- }
- return false;
-}
-
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
mark_source_chains(const struct xt_table_info *newinfo,
- unsigned int valid_hooks, void *entry0)
+ unsigned int valid_hooks, void *entry0,
+ unsigned int *offsets)
{
unsigned int hook;
@@ -487,10 +476,11 @@ mark_source_chains(const struct xt_table_info *newinfo,
XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
/* This a jump; chase it. */
+ if (!xt_find_jump_offset(offsets, newpos,
+ newinfo->number))
+ return 0;
e = (struct ip6t_entry *)
(entry0 + newpos);
- if (!find_jump_target(newinfo, e))
- return 0;
} else {
/* ... this is a fallthru */
newpos = pos + e->next_offset;
@@ -724,6 +714,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ip6t_replace *repl)
{
struct ip6t_entry *iter;
+ unsigned int *offsets;
unsigned int i;
int ret = 0;
@@ -736,6 +727,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
newinfo->underflow[i] = 0xFFFFFFFF;
}
+ offsets = xt_alloc_entry_offsets(newinfo->number);
+ if (!offsets)
+ return -ENOMEM;
i = 0;
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter, entry0, newinfo->size) {
@@ -745,15 +739,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
repl->underflow,
repl->valid_hooks);
if (ret != 0)
- return ret;
+ goto out_free;
+ if (i < repl->num_entries)
+ offsets[i] = (void *)iter - entry0;
++i;
if (strcmp(ip6t_get_target(iter)->u.user.name,
XT_ERROR_TARGET) == 0)
++newinfo->stacksize;
}
+ ret = -EINVAL;
if (i != repl->num_entries)
- return -EINVAL;
+ goto out_free;
/* Check hooks all assigned */
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
@@ -761,13 +758,16 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
if (!(repl->valid_hooks & (1 << i)))
continue;
if (newinfo->hook_entry[i] == 0xFFFFFFFF)
- return -EINVAL;
+ goto out_free;
if (newinfo->underflow[i] == 0xFFFFFFFF)
- return -EINVAL;
+ goto out_free;
}
- if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
- return -ELOOP;
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
+ ret = -ELOOP;
+ goto out_free;
+ }
+ kvfree(offsets);
/* Finally, each sanity check must pass */
i = 0;
@@ -788,6 +788,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
}
return ret;
+ out_free:
+ kvfree(offsets);
+ return ret;
}
static void
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index cb2b28883..2b1a9dcdb 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -83,10 +83,6 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb,
{
if (state->hook == NF_INET_LOCAL_OUT)
return ip6t_mangle_out(skb, state);
- if (state->hook == NF_INET_POST_ROUTING)
- return ip6t_do_table(skb, state,
- state->net->ipv6.ip6table_mangle);
- /* INPUT/FORWARD */
return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle);
}
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 71d995ff3..2535223ba 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv,
struct in6_addr saddr, daddr;
u_int8_t hop_limit;
u32 mark, flowlabel;
+ int err;
/* malformed packet, drop it */
if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
@@ -46,13 +47,16 @@ static unsigned int nf_route_table_hook(void *priv,
flowlabel = *((u32 *)ipv6_hdr(skb));
ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_QUEUE &&
+ if (ret != NF_DROP && ret != NF_STOLEN &&
(memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
- flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
- return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP;
+ flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
+ err = ip6_route_me_harder(state->net, skb);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
return ret;
}
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
index 533cd5719..92bda9908 100644
--- a/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -47,6 +47,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = {
.eval = nft_reject_ipv6_eval,
.init = nft_reject_init,
.dump = nft_reject_dump,
+ .validate = nft_reject_validate,
};
static struct nft_expr_type nft_reject_ipv6_type __read_mostly = {
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 408660477..0e983b694 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -55,7 +55,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct icmp6hdr user_icmph;
int addr_type;
struct in6_addr *daddr;
- int iif = 0;
+ int oif = 0;
struct flowi6 fl6;
int err;
struct dst_entry *dst;
@@ -78,25 +78,30 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (u->sin6_family != AF_INET6) {
return -EAFNOSUPPORT;
}
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != u->sin6_scope_id) {
- return -EINVAL;
- }
daddr = &(u->sin6_addr);
- iif = u->sin6_scope_id;
+ if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr)))
+ oif = u->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
daddr = &sk->sk_v6_daddr;
}
- if (!iif)
- iif = sk->sk_bound_dev_if;
+ if (!oif)
+ oif = sk->sk_bound_dev_if;
+
+ if (!oif)
+ oif = np->sticky_pktinfo.ipi6_ifindex;
+
+ if (!oif && ipv6_addr_is_multicast(daddr))
+ oif = np->mcast_oif;
+ else if (!oif)
+ oif = np->ucast_oif;
addr_type = ipv6_addr_type(daddr);
- if (__ipv6_addr_needs_scope_id(addr_type) && !iif)
- return -EINVAL;
- if (addr_type & IPV6_ADDR_MAPPED)
+ if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) ||
+ (addr_type & IPV6_ADDR_MAPPED) ||
+ (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if))
return -EINVAL;
/* TODO: use ip6_datagram_send_ctl to get options from cmsg */
@@ -106,15 +111,14 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.saddr = np->saddr;
fl6.daddr = *daddr;
+ fl6.flowi6_oif = oif;
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
- fl6.flowi6_oif = np->mcast_oif;
- else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
+ ipc6.tclass = np->tclass;
+ fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr);
if (IS_ERR(dst))
@@ -142,7 +146,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
pfh.family = AF_INET6;
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- ipc6.tclass = np->tclass;
ipc6.dontfrag = np->dontfrag;
ipc6.opt = NULL;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 896350df6..590dd1f77 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -878,6 +878,11 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (inet->hdrincl)
fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;
+ if (ipc6.tclass < 0)
+ ipc6.tclass = np->tclass;
+
+ fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
@@ -886,9 +891,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (ipc6.hlimit < 0)
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- if (ipc6.tclass < 0)
- ipc6.tclass = np->tclass;
-
if (ipc6.dontfrag < 0)
ipc6.dontfrag = np->dontfrag;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 520b7884d..269218aac 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1042,8 +1042,8 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
return pcpu_rt;
}
-static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
- struct flowi6 *fl6, int flags)
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int flags)
{
struct fib6_node *fn, *saved_fn;
struct rt6_info *rt;
@@ -1139,6 +1139,7 @@ redo_rt6_select:
}
}
+EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
struct flowi6 *fl6, int flags)
@@ -1985,9 +1986,18 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
if (!(gwa_type & IPV6_ADDR_UNICAST))
goto out;
- if (cfg->fc_table)
+ if (cfg->fc_table) {
grt = ip6_nh_lookup_table(net, cfg, gw_addr);
+ if (grt) {
+ if (grt->rt6i_flags & RTF_GATEWAY ||
+ (dev && dev != grt->dst.dev)) {
+ ip6_rt_put(grt);
+ grt = NULL;
+ }
+ }
+ }
+
if (!grt)
grt = rt6_lookup(net, gw_addr, NULL,
cfg->fc_ifindex, 1);
@@ -2200,7 +2210,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
* first-hop router for the specified ICMP Destination Address.
*/
- if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
+ if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
return;
}
@@ -2235,12 +2245,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
* We have finally decided to accept it.
*/
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
(on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
- NEIGH_UPDATE_F_ISROUTER))
- );
+ NEIGH_UPDATE_F_ISROUTER)),
+ NDISC_REDIRECT, &ndopts);
nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
if (!nrt)
@@ -2585,23 +2595,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
return rt;
}
-int ip6_route_get_saddr(struct net *net,
- struct rt6_info *rt,
- const struct in6_addr *daddr,
- unsigned int prefs,
- struct in6_addr *saddr)
-{
- struct inet6_dev *idev =
- rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
- int err = 0;
- if (rt && rt->rt6i_prefsrc.plen)
- *saddr = rt->rt6i_prefsrc.addr;
- else
- err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
- daddr, prefs, saddr);
- return err;
-}
-
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
struct net_device *dev;
@@ -3209,7 +3202,9 @@ static int rt6_fill_node(struct net *net,
if (iif) {
#ifdef CONFIG_IPV6_MROUTE
if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
- int err = ip6mr_get_route(net, skb, rtm, nowait);
+ int err = ip6mr_get_route(net, skb, rtm, nowait,
+ portid);
+
if (err <= 0) {
if (!nowait) {
if (err == 0)
@@ -3306,6 +3301,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
err = -EINVAL;
memset(&fl6, 0, sizeof(fl6));
+ rtm = nlmsg_data(nlh);
+ fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
if (tb[RTA_SRC]) {
if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 0619ac708..182b6a9be 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -479,47 +479,12 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
dev_put(dev);
}
-/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
- * if sufficient data bytes are available
- */
-static int ipip6_err_gen_icmpv6_unreach(struct sk_buff *skb)
-{
- int ihl = ((const struct iphdr *)skb->data)->ihl*4;
- struct rt6_info *rt;
- struct sk_buff *skb2;
-
- if (!pskb_may_pull(skb, ihl + sizeof(struct ipv6hdr) + 8))
- return 1;
-
- skb2 = skb_clone(skb, GFP_ATOMIC);
-
- if (!skb2)
- return 1;
-
- skb_dst_drop(skb2);
- skb_pull(skb2, ihl);
- skb_reset_network_header(skb2);
-
- rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0);
-
- if (rt && rt->dst.dev)
- skb2->dev = rt->dst.dev;
-
- icmpv6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
-
- if (rt)
- ip6_rt_put(rt);
-
- kfree_skb(skb2);
-
- return 0;
-}
-
static int ipip6_err(struct sk_buff *skb, u32 info)
{
const struct iphdr *iph = (const struct iphdr *)skb->data;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ unsigned int data_len = 0;
struct ip_tunnel *t;
int err;
@@ -544,6 +509,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
return 0;
+ data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
case ICMP_REDIRECT:
break;
@@ -571,11 +537,11 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
goto out;
}
- if (t->parms.iph.daddr == 0)
+ err = 0;
+ if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len))
goto out;
- err = 0;
- if (!ipip6_err_gen_icmpv6_unreach(skb))
+ if (t->parms.iph.daddr == 0)
goto out;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
@@ -722,12 +688,19 @@ out:
return 0;
}
-static const struct tnl_ptk_info tpi = {
+static const struct tnl_ptk_info ipip_tpi = {
/* no tunnel info required for ipip. */
.proto = htons(ETH_P_IP),
};
-static int ipip_rcv(struct sk_buff *skb)
+#if IS_ENABLED(CONFIG_MPLS)
+static const struct tnl_ptk_info mplsip_tpi = {
+ /* no tunnel info required for mplsip. */
+ .proto = htons(ETH_P_MPLS_UC),
+};
+#endif
+
+static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
const struct iphdr *iph;
struct ip_tunnel *tunnel;
@@ -736,15 +709,23 @@ static int ipip_rcv(struct sk_buff *skb)
tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
iph->saddr, iph->daddr);
if (tunnel) {
- if (tunnel->parms.iph.protocol != IPPROTO_IPIP &&
+ const struct tnl_ptk_info *tpi;
+
+ if (tunnel->parms.iph.protocol != ipproto &&
tunnel->parms.iph.protocol != 0)
goto drop;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- if (iptunnel_pull_header(skb, 0, tpi.proto, false))
+#if IS_ENABLED(CONFIG_MPLS)
+ if (ipproto == IPPROTO_MPLS)
+ tpi = &mplsip_tpi;
+ else
+#endif
+ tpi = &ipip_tpi;
+ if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
+ return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
}
return 1;
@@ -754,6 +735,18 @@ drop:
return 0;
}
+static int ipip_rcv(struct sk_buff *skb)
+{
+ return sit_tunnel_rcv(skb, IPPROTO_IPIP);
+}
+
+#if IS_ENABLED(CONFIG_MPLS)
+static int mplsip_rcv(struct sk_buff *skb)
+{
+ return sit_tunnel_rcv(skb, IPPROTO_MPLS);
+}
+#endif
+
/*
* If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function
* stores the embedded IPv4 address in v4dst and returns true.
@@ -825,9 +818,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
u8 protocol = IPPROTO_IPV6;
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
- if (skb->protocol != htons(ETH_P_IPV6))
- goto tx_error;
-
if (tos == 1)
tos = ipv6_get_dsfield(iph6);
@@ -995,7 +985,8 @@ tx_error:
return NETDEV_TX_OK;
}
-static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb,
+ struct net_device *dev, u8 ipproto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tiph = &tunnel->parms.iph;
@@ -1003,9 +994,9 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
goto tx_error;
- skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+ skb_set_inner_ipproto(skb, ipproto);
- ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP);
+ ip_tunnel_xmit(skb, dev, tiph, ipproto);
return NETDEV_TX_OK;
tx_error:
kfree_skb(skb);
@@ -1018,11 +1009,16 @@ static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb,
{
switch (skb->protocol) {
case htons(ETH_P_IP):
- ipip_tunnel_xmit(skb, dev);
+ sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP);
break;
case htons(ETH_P_IPV6):
ipip6_tunnel_xmit(skb, dev);
break;
+#if IS_ENABLED(CONFIG_MPLS)
+ case htons(ETH_P_MPLS_UC):
+ sit_tunnel_xmit__(skb, dev, IPPROTO_MPLS);
+ break;
+#endif
default:
goto tx_err;
}
@@ -1130,6 +1126,16 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
}
#endif
+bool ipip6_valid_ip_proto(u8 ipproto)
+{
+ return ipproto == IPPROTO_IPV6 ||
+ ipproto == IPPROTO_IPIP ||
+#if IS_ENABLED(CONFIG_MPLS)
+ ipproto == IPPROTO_MPLS ||
+#endif
+ ipproto == 0;
+}
+
static int
ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
@@ -1189,9 +1195,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
goto done;
err = -EINVAL;
- if (p.iph.protocol != IPPROTO_IPV6 &&
- p.iph.protocol != IPPROTO_IPIP &&
- p.iph.protocol != 0)
+ if (!ipip6_valid_ip_proto(p.iph.protocol))
goto done;
if (p.iph.version != 4 ||
p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
@@ -1416,9 +1420,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[])
return 0;
proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
- if (proto != IPPROTO_IPV6 &&
- proto != IPPROTO_IPIP &&
- proto != 0)
+ if (!ipip6_valid_ip_proto(proto))
return -EINVAL;
return 0;
@@ -1760,6 +1762,14 @@ static struct xfrm_tunnel ipip_handler __read_mostly = {
.priority = 2,
};
+#if IS_ENABLED(CONFIG_MPLS)
+static struct xfrm_tunnel mplsip_handler __read_mostly = {
+ .handler = mplsip_rcv,
+ .err_handler = ipip6_err,
+ .priority = 2,
+};
+#endif
+
static void __net_exit sit_destroy_tunnels(struct net *net,
struct list_head *head)
{
@@ -1855,6 +1865,9 @@ static void __exit sit_cleanup(void)
rtnl_link_unregister(&sit_link_ops);
xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+#if IS_ENABLED(CONFIG_MPLS)
+ xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
+#endif
unregister_pernet_device(&sit_net_ops);
rcu_barrier(); /* Wait for completion of call_rcu()'s */
@@ -1864,7 +1877,7 @@ static int __init sit_init(void)
{
int err;
- pr_info("IPv6 over IPv4 tunneling driver\n");
+ pr_info("IPv6, IPv4 and MPLS over IPv4 tunneling driver\n");
err = register_pernet_device(&sit_net_ops);
if (err < 0)
@@ -1879,6 +1892,13 @@ static int __init sit_init(void)
pr_info("%s: can't register ip4ip4\n", __func__);
goto xfrm_tunnel4_failed;
}
+#if IS_ENABLED(CONFIG_MPLS)
+ err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS);
+ if (err < 0) {
+ pr_info("%s: can't register mplsip\n", __func__);
+ goto xfrm_tunnel_mpls_failed;
+ }
+#endif
err = rtnl_link_register(&sit_link_ops);
if (err < 0)
goto rtnl_link_failed;
@@ -1887,6 +1907,10 @@ out:
return err;
rtnl_link_failed:
+#if IS_ENABLED(CONFIG_MPLS)
+ xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
+xfrm_tunnel_mpls_failed:
+#endif
xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
xfrm_tunnel4_failed:
xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 45243bbe5..69c50e737 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -15,6 +15,9 @@
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/inet_frag.h>
+#ifdef CONFIG_NETLABEL
+#include <net/calipso.h>
+#endif
static int one = 1;
static int auto_flowlabels_min;
@@ -106,6 +109,22 @@ static struct ctl_table ipv6_rotable[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one
},
+#ifdef CONFIG_NETLABEL
+ {
+ .procname = "calipso_cache_enable",
+ .data = &calipso_cache_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "calipso_cache_bucket_size",
+ .data = &calipso_cache_bucketsize,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_NETLABEL */
{ }
};
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a2aaca9be..27a968b88 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -459,6 +459,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt;
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
@@ -479,8 +480,10 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
rcu_read_lock();
- err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt),
- np->tclass);
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
+ err = ip6_xmit(sk, skb, fl6, opt, np->tclass);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -492,6 +495,7 @@ done:
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
+ kfree(inet_rsk(req)->ipv6_opt);
kfree_skb(inet_rsk(req)->pktopts);
}
@@ -542,26 +546,33 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval,
AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
-static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr, int nbytes)
+static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr,
+ const struct tcphdr *th, int nbytes)
{
struct tcp6_pseudohdr *bp;
struct scatterlist sg;
+ struct tcphdr *_th;
- bp = &hp->md5_blk.ip6;
+ bp = hp->scratch;
/* 1. TCP pseudo-header (RFC2460) */
bp->saddr = *saddr;
bp->daddr = *daddr;
bp->protocol = cpu_to_be32(IPPROTO_TCP);
bp->len = cpu_to_be32(nbytes);
- sg_init_one(&sg, bp, sizeof(*bp));
- ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+ _th = (struct tcphdr *)(bp + 1);
+ memcpy(_th, th, sizeof(*th));
+ _th->check = 0;
+
+ sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+ sizeof(*bp) + sizeof(*th));
return crypto_ahash_update(hp->md5_req);
}
-static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
+static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
const struct in6_addr *daddr, struct in6_addr *saddr,
const struct tcphdr *th)
{
@@ -575,9 +586,7 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
if (crypto_ahash_init(req))
goto clear_hash;
- if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_header(hp, th))
+ if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
@@ -622,9 +631,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
if (crypto_ahash_init(req))
goto clear_hash;
- if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
- goto clear_hash;
- if (tcp_md5_hash_header(hp, th))
+ if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len))
goto clear_hash;
if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
goto clear_hash;
@@ -1131,7 +1138,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
but we make one more one thing there: reattach optmem
to newsk.
*/
- opt = rcu_dereference(np->opt);
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
if (opt) {
opt = ipv6_dup_options(newsk, opt);
RCU_INIT_POINTER(newnp->opt, opt);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 42a2edf7c..19ac3a1c3 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1207,6 +1207,11 @@ do_udp_sendmsg:
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ if (ipc6.tclass < 0)
+ ipc6.tclass = np->tclass;
+
+ fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+
dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
@@ -1217,9 +1222,6 @@ do_udp_sendmsg:
if (ipc6.hlimit < 0)
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- if (ipc6.tclass < 0)
- ipc6.tclass = np->tclass;
-
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
back_from_confirm:
@@ -1458,7 +1460,6 @@ struct proto udpv6_prot = {
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udp_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udpv6_setsockopt,
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 9cf097e20..fd6ef4148 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -50,7 +50,6 @@ struct proto udplitev6_prot = {
.unhash = udp_lib_unhash,
.get_port = udp_v6_get_port,
.obj_size = sizeof(struct udp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udpv6_setsockopt,
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 0eaab1fa6..b5789562a 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -21,8 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
return xfrm6_extract_header(skb);
}
-int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
+int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
+ struct ip6_tnl *t)
{
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
return xfrm_input(skb, nexthdr, spi, 0);
@@ -48,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
return -1;
}
-int xfrm6_rcv(struct sk_buff *skb)
+int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t)
{
return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff],
- 0);
+ 0, t);
}
-EXPORT_SYMBOL(xfrm6_rcv);
+EXPORT_SYMBOL(xfrm6_rcv_tnl);
+int xfrm6_rcv(struct sk_buff *skb)
+{
+ return xfrm6_rcv_tnl(skb, NULL);
+}
+EXPORT_SYMBOL(xfrm6_rcv);
int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
xfrm_address_t *saddr, u8 proto)
{
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index c074771a1..70a86adad 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -36,7 +36,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
int err;
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_oif = oif;
+ fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif);
fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;
memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
if (saddr)
@@ -366,12 +366,12 @@ static void __net_exit xfrm6_net_sysctl_exit(struct net *net)
kfree(table);
}
#else /* CONFIG_SYSCTL */
-static int inline xfrm6_net_sysctl_init(struct net *net)
+static inline int xfrm6_net_sysctl_init(struct net *net)
{
return 0;
}
-static void inline xfrm6_net_sysctl_exit(struct net *net)
+static inline void xfrm6_net_sysctl_exit(struct net *net)
{
}
#endif
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 5743044cd..e1c0bbe79 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb)
__be32 spi;
spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr);
- return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi);
+ return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL);
}
static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 8d2f7c9b4..ccc244406 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -832,7 +832,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
struct sock *sk = sock->sk;
struct irda_sock *new, *self = irda_sk(sk);
struct sock *newsk;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
int err;
err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
@@ -900,7 +900,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
err = -EPERM; /* value does not seem to make sense. -arnd */
if (!new->tsap) {
pr_debug("%s(), dup failed!\n", __func__);
- kfree_skb(skb);
goto out;
}
@@ -919,7 +918,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
/* Clean up the original one to keep it in listen state */
irttp_listen(self->tsap);
- kfree_skb(skb);
sk->sk_ack_backlog--;
newsock->state = SS_CONNECTED;
@@ -927,6 +925,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
irda_connect_response(new);
err = 0;
out:
+ kfree_skb(skb);
release_sock(sk);
return err;
}
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index d4fdf8f7b..8f5678cb6 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -246,9 +246,6 @@ static int ircomm_tty_get_serial_info(struct ircomm_tty_cb *self,
{
struct serial_struct info;
- if (!retinfo)
- return -EFAULT;
-
memset(&info, 0, sizeof(info));
info.line = self->line;
info.flags = self->port.flags;
@@ -258,11 +255,6 @@ static int ircomm_tty_get_serial_info(struct ircomm_tty_cb *self,
/* For compatibility */
info.type = PORT_16550A;
- info.port = 0;
- info.irq = 0;
- info.xmit_fifo_size = 0;
- info.hub6 = 0;
- info.custom_divisor = 0;
if (copy_to_user(retinfo, &info, sizeof(*retinfo)))
return -EFAULT;
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index fc3598a92..02b45a8e8 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -22,6 +22,7 @@
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/poll.h>
+#include <linux/security.h>
#include <net/sock.h>
#include <asm/ebcdic.h>
#include <asm/cpcmd.h>
@@ -530,8 +531,10 @@ static void iucv_sock_close(struct sock *sk)
static void iucv_sock_init(struct sock *sk, struct sock *parent)
{
- if (parent)
+ if (parent) {
sk->sk_type = parent->sk_type;
+ security_sk_clone(parent, sk);
+ }
}
static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern)
@@ -1033,6 +1036,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
{
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
+ size_t headroom, linear;
struct sk_buff *skb;
struct iucv_message txmsg = {0};
struct cmsghdr *cmsg;
@@ -1110,20 +1114,31 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
* this is fine for SOCK_SEQPACKET (unless we want to support
* segmented records using the MSG_EOR flag), but
* for SOCK_STREAM we might want to improve it in future */
- if (iucv->transport == AF_IUCV_TRANS_HIPER)
- skb = sock_alloc_send_skb(sk,
- len + sizeof(struct af_iucv_trans_hdr) + ETH_HLEN,
- noblock, &err);
- else
- skb = sock_alloc_send_skb(sk, len, noblock, &err);
+ headroom = (iucv->transport == AF_IUCV_TRANS_HIPER)
+ ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0;
+ if (headroom + len < PAGE_SIZE) {
+ linear = len;
+ } else {
+ /* In nonlinear "classic" iucv skb,
+ * reserve space for iucv_array
+ */
+ if (iucv->transport != AF_IUCV_TRANS_HIPER)
+ headroom += sizeof(struct iucv_array) *
+ (MAX_SKB_FRAGS + 1);
+ linear = PAGE_SIZE - headroom;
+ }
+ skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear,
+ noblock, &err, 0);
if (!skb)
goto out;
- if (iucv->transport == AF_IUCV_TRANS_HIPER)
- skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN);
- if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
- err = -EFAULT;
+ if (headroom)
+ skb_reserve(skb, headroom);
+ skb_put(skb, linear);
+ skb->len = len;
+ skb->data_len = len - linear;
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
+ if (err)
goto fail;
- }
/* wait if outstanding messages for iucv path has reached */
timeo = sock_sndtimeo(sk, noblock);
@@ -1148,49 +1163,67 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
atomic_dec(&iucv->msg_sent);
goto fail;
}
- goto release;
- }
- skb_queue_tail(&iucv->send_skb_q, skb);
-
- if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags)
- && skb->len <= 7) {
- err = iucv_send_iprm(iucv->path, &txmsg, skb);
+ } else { /* Classic VM IUCV transport */
+ skb_queue_tail(&iucv->send_skb_q, skb);
+
+ if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) &&
+ skb->len <= 7) {
+ err = iucv_send_iprm(iucv->path, &txmsg, skb);
+
+ /* on success: there is no message_complete callback */
+ /* for an IPRMDATA msg; remove skb from send queue */
+ if (err == 0) {
+ skb_unlink(skb, &iucv->send_skb_q);
+ kfree_skb(skb);
+ }
- /* on success: there is no message_complete callback
- * for an IPRMDATA msg; remove skb from send queue */
- if (err == 0) {
- skb_unlink(skb, &iucv->send_skb_q);
- kfree_skb(skb);
+ /* this error should never happen since the */
+ /* IUCV_IPRMDATA path flag is set... sever path */
+ if (err == 0x15) {
+ pr_iucv->path_sever(iucv->path, NULL);
+ skb_unlink(skb, &iucv->send_skb_q);
+ err = -EPIPE;
+ goto fail;
+ }
+ } else if (skb_is_nonlinear(skb)) {
+ struct iucv_array *iba = (struct iucv_array *)skb->head;
+ int i;
+
+ /* skip iucv_array lying in the headroom */
+ iba[0].address = (u32)(addr_t)skb->data;
+ iba[0].length = (u32)skb_headlen(skb);
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ iba[i + 1].address =
+ (u32)(addr_t)skb_frag_address(frag);
+ iba[i + 1].length = (u32)skb_frag_size(frag);
+ }
+ err = pr_iucv->message_send(iucv->path, &txmsg,
+ IUCV_IPBUFLST, 0,
+ (void *)iba, skb->len);
+ } else { /* non-IPRM Linear skb */
+ err = pr_iucv->message_send(iucv->path, &txmsg,
+ 0, 0, (void *)skb->data, skb->len);
}
-
- /* this error should never happen since the
- * IUCV_IPRMDATA path flag is set... sever path */
- if (err == 0x15) {
- pr_iucv->path_sever(iucv->path, NULL);
+ if (err) {
+ if (err == 3) {
+ user_id[8] = 0;
+ memcpy(user_id, iucv->dst_user_id, 8);
+ appl_id[8] = 0;
+ memcpy(appl_id, iucv->dst_name, 8);
+ pr_err(
+ "Application %s on z/VM guest %s exceeds message limit\n",
+ appl_id, user_id);
+ err = -EAGAIN;
+ } else {
+ err = -EPIPE;
+ }
skb_unlink(skb, &iucv->send_skb_q);
- err = -EPIPE;
goto fail;
}
- } else
- err = pr_iucv->message_send(iucv->path, &txmsg, 0, 0,
- (void *) skb->data, skb->len);
- if (err) {
- if (err == 3) {
- user_id[8] = 0;
- memcpy(user_id, iucv->dst_user_id, 8);
- appl_id[8] = 0;
- memcpy(appl_id, iucv->dst_name, 8);
- pr_err("Application %s on z/VM guest %s"
- " exceeds message limit\n",
- appl_id, user_id);
- err = -EAGAIN;
- } else
- err = -EPIPE;
- skb_unlink(skb, &iucv->send_skb_q);
- goto fail;
}
-release:
release_sock(sk);
return len;
@@ -1201,42 +1234,32 @@ out:
return err;
}
-/* iucv_fragment_skb() - Fragment a single IUCV message into multiple skb's
- *
- * Locking: must be called with message_q.lock held
- */
-static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len)
+static struct sk_buff *alloc_iucv_recv_skb(unsigned long len)
{
- int dataleft, size, copied = 0;
- struct sk_buff *nskb;
-
- dataleft = len;
- while (dataleft) {
- if (dataleft >= sk->sk_rcvbuf / 4)
- size = sk->sk_rcvbuf / 4;
- else
- size = dataleft;
-
- nskb = alloc_skb(size, GFP_ATOMIC | GFP_DMA);
- if (!nskb)
- return -ENOMEM;
-
- /* copy target class to control buffer of new skb */
- IUCV_SKB_CB(nskb)->class = IUCV_SKB_CB(skb)->class;
-
- /* copy data fragment */
- memcpy(nskb->data, skb->data + copied, size);
- copied += size;
- dataleft -= size;
-
- skb_reset_transport_header(nskb);
- skb_reset_network_header(nskb);
- nskb->len = size;
+ size_t headroom, linear;
+ struct sk_buff *skb;
+ int err;
- skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, nskb);
+ if (len < PAGE_SIZE) {
+ headroom = 0;
+ linear = len;
+ } else {
+ headroom = sizeof(struct iucv_array) * (MAX_SKB_FRAGS + 1);
+ linear = PAGE_SIZE - headroom;
+ }
+ skb = alloc_skb_with_frags(headroom + linear, len - linear,
+ 0, &err, GFP_ATOMIC | GFP_DMA);
+ WARN_ONCE(!skb,
+ "alloc of recv iucv skb len=%lu failed with errcode=%d\n",
+ len, err);
+ if (skb) {
+ if (headroom)
+ skb_reserve(skb, headroom);
+ skb_put(skb, linear);
+ skb->len = len;
+ skb->data_len = len - linear;
}
-
- return 0;
+ return skb;
}
/* iucv_process_message() - Receive a single outstanding IUCV message
@@ -1263,31 +1286,32 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
skb->len = 0;
}
} else {
- rc = pr_iucv->message_receive(path, msg,
+ if (skb_is_nonlinear(skb)) {
+ struct iucv_array *iba = (struct iucv_array *)skb->head;
+ int i;
+
+ iba[0].address = (u32)(addr_t)skb->data;
+ iba[0].length = (u32)skb_headlen(skb);
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ iba[i + 1].address =
+ (u32)(addr_t)skb_frag_address(frag);
+ iba[i + 1].length = (u32)skb_frag_size(frag);
+ }
+ rc = pr_iucv->message_receive(path, msg,
+ IUCV_IPBUFLST,
+ (void *)iba, len, NULL);
+ } else {
+ rc = pr_iucv->message_receive(path, msg,
msg->flags & IUCV_IPRMDATA,
skb->data, len, NULL);
+ }
if (rc) {
kfree_skb(skb);
return;
}
- /* we need to fragment iucv messages for SOCK_STREAM only;
- * for SOCK_SEQPACKET, it is only relevant if we support
- * record segmentation using MSG_EOR (see also recvmsg()) */
- if (sk->sk_type == SOCK_STREAM &&
- skb->truesize >= sk->sk_rcvbuf / 4) {
- rc = iucv_fragment_skb(sk, skb, len);
- kfree_skb(skb);
- skb = NULL;
- if (rc) {
- pr_iucv->path_sever(path, NULL);
- return;
- }
- skb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q);
- } else {
- skb_reset_transport_header(skb);
- skb_reset_network_header(skb);
- skb->len = len;
- }
+ WARN_ON_ONCE(skb->len != len);
}
IUCV_SKB_CB(skb)->offset = 0;
@@ -1306,7 +1330,7 @@ static void iucv_process_message_q(struct sock *sk)
struct sock_msg_q *p, *n;
list_for_each_entry_safe(p, n, &iucv->message_q.list, list) {
- skb = alloc_skb(iucv_msg_length(&p->msg), GFP_ATOMIC | GFP_DMA);
+ skb = alloc_iucv_recv_skb(iucv_msg_length(&p->msg));
if (!skb)
break;
iucv_process_message(sk, skb, p->path, &p->msg);
@@ -1801,7 +1825,7 @@ static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg)
if (len > sk->sk_rcvbuf)
goto save_message;
- skb = alloc_skb(iucv_msg_length(msg), GFP_ATOMIC | GFP_DMA);
+ skb = alloc_iucv_recv_skb(iucv_msg_length(msg));
if (!skb)
goto save_message;
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 7eaa000c9..88a2a3ba4 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -320,21 +320,29 @@ static union iucv_param *iucv_param_irq[NR_CPUS];
*
* Returns the result of the CP IUCV call.
*/
-static inline int iucv_call_b2f0(int command, union iucv_param *parm)
+static inline int __iucv_call_b2f0(int command, union iucv_param *parm)
{
register unsigned long reg0 asm ("0");
register unsigned long reg1 asm ("1");
int ccode;
reg0 = command;
- reg1 = virt_to_phys(parm);
+ reg1 = (unsigned long)parm;
asm volatile(
" .long 0xb2f01000\n"
" ipm %0\n"
" srl %0,28\n"
: "=d" (ccode), "=m" (*parm), "+d" (reg0), "+a" (reg1)
: "m" (*parm) : "cc");
- return (ccode == 1) ? parm->ctrl.iprcode : ccode;
+ return ccode;
+}
+
+static inline int iucv_call_b2f0(int command, union iucv_param *parm)
+{
+ int ccode;
+
+ ccode = __iucv_call_b2f0(command, parm);
+ return ccode == 1 ? parm->ctrl.iprcode : ccode;
}
/**
@@ -345,16 +353,12 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm)
* Returns the maximum number of connections or -EPERM is IUCV is not
* available.
*/
-static int iucv_query_maxconn(void)
+static int __iucv_query_maxconn(void *param, unsigned long *max_pathid)
{
register unsigned long reg0 asm ("0");
register unsigned long reg1 asm ("1");
- void *param;
int ccode;
- param = kzalloc(sizeof(union iucv_param), GFP_KERNEL|GFP_DMA);
- if (!param)
- return -ENOMEM;
reg0 = IUCV_QUERY;
reg1 = (unsigned long) param;
asm volatile (
@@ -362,8 +366,22 @@ static int iucv_query_maxconn(void)
" ipm %0\n"
" srl %0,28\n"
: "=d" (ccode), "+d" (reg0), "+d" (reg1) : : "cc");
+ *max_pathid = reg1;
+ return ccode;
+}
+
+static int iucv_query_maxconn(void)
+{
+ unsigned long max_pathid;
+ void *param;
+ int ccode;
+
+ param = kzalloc(sizeof(union iucv_param), GFP_KERNEL | GFP_DMA);
+ if (!param)
+ return -ENOMEM;
+ ccode = __iucv_query_maxconn(param, &max_pathid);
if (ccode == 0)
- iucv_max_pathid = reg1;
+ iucv_max_pathid = max_pathid;
kfree(param);
return ccode ? -EPERM : 0;
}
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index fda7f4715..16c2e03bd 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -88,13 +88,9 @@ struct kcm_proc_mux_state {
static int kcm_seq_open(struct inode *inode, struct file *file)
{
struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode);
- int err;
- err = seq_open_net(inode, file, &muxinfo->seq_ops,
+ return seq_open_net(inode, file, &muxinfo->seq_ops,
sizeof(struct kcm_proc_mux_state));
- if (err < 0)
- return err;
- return err;
}
static void kcm_format_mux_header(struct seq_file *seq)
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 48613f5dd..411693288 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1766,18 +1766,12 @@ static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
if (!csock)
return -ENOENT;
- prog = bpf_prog_get(info->bpf_fd);
+ prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER);
if (IS_ERR(prog)) {
err = PTR_ERR(prog);
goto out;
}
- if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
- bpf_prog_put(prog);
- err = -EINVAL;
- goto out;
- }
-
err = kcm_attach(sock, csock, prog);
if (err) {
bpf_prog_put(prog);
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 1e40dacaa..a2ed3bda4 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1855,6 +1855,9 @@ static __net_exit void l2tp_exit_net(struct net *net)
(void)l2tp_tunnel_delete(tunnel);
}
rcu_read_unlock_bh();
+
+ flush_workqueue(l2tp_wq);
+ rcu_barrier();
}
static struct pernet_operations l2tp_net_ops = {
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index e253c26f3..57fc5a46c 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -67,7 +67,6 @@ static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net)
return net_generic(net, l2tp_eth_net_id);
}
-static struct lock_class_key l2tp_eth_tx_busylock;
static int l2tp_eth_dev_init(struct net_device *dev)
{
struct l2tp_eth *priv = netdev_priv(dev);
@@ -75,7 +74,8 @@ static int l2tp_eth_dev_init(struct net_device *dev)
priv->dev = dev;
eth_hw_addr_random(dev);
eth_broadcast_addr(dev->broadcast);
- dev->qdisc_tx_busylock = &l2tp_eth_tx_busylock;
+ netdev_lockdep_set_classes(dev);
+
return 0;
}
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 6c54e03fe..ea2ae6664 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -611,6 +611,11 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ if (ipc6.tclass < 0)
+ ipc6.tclass = np->tclass;
+
+ fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
@@ -620,9 +625,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (ipc6.hlimit < 0)
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- if (ipc6.tclass < 0)
- ipc6.tclass = np->tclass;
-
if (ipc6.dontfrag < 0)
ipc6.dontfrag = np->dontfrag;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 652c250b9..232cb9203 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -856,7 +856,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
error = -ENOTCONN;
if (sk == NULL)
goto end;
- if (sk->sk_state != PPPOX_CONNECTED)
+ if (!(sk->sk_state & PPPOX_CONNECTED))
goto end;
error = -EBADF;
@@ -866,10 +866,8 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
pls = l2tp_session_priv(session);
tunnel = l2tp_sock_to_tunnel(pls->tunnel_sock);
- if (tunnel == NULL) {
- error = -EBADF;
+ if (tunnel == NULL)
goto end_put_sess;
- }
inet = inet_sk(tunnel->sock);
if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) {
@@ -947,12 +945,11 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
}
*usockaddr_len = len;
+ error = 0;
sock_put(pls->tunnel_sock);
end_put_sess:
sock_put(sk);
- error = 0;
-
end:
return error;
}
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 6651a78e1..c4a1c3e84 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -10,6 +10,7 @@
*/
#include <linux/netdevice.h>
+#include <net/fib_rules.h>
#include <net/l3mdev.h>
/**
@@ -107,7 +108,7 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
*/
struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
- const struct flowi6 *fl6)
+ struct flowi6 *fl6)
{
struct dst_entry *dst = NULL;
struct net_device *dev;
@@ -160,3 +161,64 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4)
return rc;
}
EXPORT_SYMBOL_GPL(l3mdev_get_saddr);
+
+int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
+ struct flowi6 *fl6)
+{
+ struct net_device *dev;
+ int rc = 0;
+
+ if (fl6->flowi6_oif) {
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
+ if (dev && netif_is_l3_slave(dev))
+ dev = netdev_master_upper_dev_get_rcu(dev);
+
+ if (dev && netif_is_l3_master(dev) &&
+ dev->l3mdev_ops->l3mdev_get_saddr6)
+ rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6);
+
+ rcu_read_unlock();
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(l3mdev_get_saddr6);
+
+/**
+ * l3mdev_fib_rule_match - Determine if flowi references an
+ * L3 master device
+ * @net: network namespace for device index lookup
+ * @fl: flow struct
+ */
+
+int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
+ struct fib_lookup_arg *arg)
+{
+ struct net_device *dev;
+ int rc = 0;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, fl->flowi_oif);
+ if (dev && netif_is_l3_master(dev) &&
+ dev->l3mdev_ops->l3mdev_fib_table) {
+ arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev);
+ rc = 1;
+ goto out;
+ }
+
+ dev = dev_get_by_index_rcu(net, fl->flowi_iif);
+ if (dev && netif_is_l3_master(dev) &&
+ dev->l3mdev_ops->l3mdev_fib_table) {
+ arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev);
+ rc = 1;
+ goto out;
+ }
+
+out:
+ rcu_read_unlock();
+
+ return rc;
+}
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 0a7305a97..afa94687d 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -312,6 +312,24 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
mutex_lock(&sta->ampdu_mlme.mtx);
if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
+ tid_agg_rx = rcu_dereference_protected(
+ sta->ampdu_mlme.tid_rx[tid],
+ lockdep_is_held(&sta->ampdu_mlme.mtx));
+
+ if (tid_agg_rx->dialog_token == dialog_token) {
+ ht_dbg_ratelimited(sta->sdata,
+ "updated AddBA Req from %pM on tid %u\n",
+ sta->sta.addr, tid);
+ /* We have no API to update the timeout value in the
+ * driver so reject the timeout update.
+ */
+ status = WLAN_STATUS_REQUEST_DECLINED;
+ ieee80211_send_addba_resp(sta->sdata, sta->sta.addr,
+ tid, dialog_token, status,
+ 1, buf_size, timeout);
+ goto end;
+ }
+
ht_dbg_ratelimited(sta->sdata,
"unexpected AddBA Req from %pM on tid %u\n",
sta->sta.addr, tid);
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 744ad1c0b..45319cc01 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -194,17 +194,21 @@ static void
ieee80211_agg_stop_txq(struct sta_info *sta, int tid)
{
struct ieee80211_txq *txq = sta->sta.txq[tid];
+ struct ieee80211_sub_if_data *sdata;
+ struct fq *fq;
struct txq_info *txqi;
if (!txq)
return;
txqi = to_txq_info(txq);
+ sdata = vif_to_sdata(txq->vif);
+ fq = &sdata->local->fq;
/* Lock here to protect against further seqno updates on dequeue */
- spin_lock_bh(&txqi->queue.lock);
+ spin_lock_bh(&fq->lock);
set_bit(IEEE80211_TXQ_STOP, &txqi->flags);
- spin_unlock_bh(&txqi->queue.lock);
+ spin_unlock_bh(&fq->lock);
}
static void
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 8cc49c044..543b1d4fc 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -997,6 +997,7 @@ static void sta_apply_mesh_params(struct ieee80211_local *local,
if (sta->mesh->plink_state != NL80211_PLINK_ESTAB)
changed = mesh_plink_inc_estab_count(sdata);
sta->mesh->plink_state = params->plink_state;
+ sta->mesh->aid = params->peer_aid;
ieee80211_mps_sta_status_update(sta);
changed |= ieee80211_mps_set_sta_local_pm(sta,
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index b251b2f7f..2906c1004 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -10,6 +10,7 @@
#include <linux/debugfs.h>
#include <linux/rtnetlink.h>
+#include <linux/vmalloc.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
#include "rate.h"
@@ -70,6 +71,177 @@ DEBUGFS_READONLY_FILE(wep_iv, "%#08x",
DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s",
local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver");
+struct aqm_info {
+ struct ieee80211_local *local;
+ size_t size;
+ size_t len;
+ unsigned char buf[0];
+};
+
+#define AQM_HDR_LEN 200
+#define AQM_HW_ENTRY_LEN 40
+#define AQM_TXQ_ENTRY_LEN 110
+
+static int aqm_open(struct inode *inode, struct file *file)
+{
+ struct ieee80211_local *local = inode->i_private;
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta;
+ struct txq_info *txqi;
+ struct fq *fq = &local->fq;
+ struct aqm_info *info = NULL;
+ int len = 0;
+ int i;
+
+ if (!local->ops->wake_tx_queue)
+ return -EOPNOTSUPP;
+
+ len += AQM_HDR_LEN;
+ len += 6 * AQM_HW_ENTRY_LEN;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list)
+ len += AQM_TXQ_ENTRY_LEN;
+ list_for_each_entry_rcu(sta, &local->sta_list, list)
+ len += AQM_TXQ_ENTRY_LEN * ARRAY_SIZE(sta->sta.txq);
+ rcu_read_unlock();
+
+ info = vmalloc(len);
+ if (!info)
+ return -ENOMEM;
+
+ spin_lock_bh(&local->fq.lock);
+ rcu_read_lock();
+
+ file->private_data = info;
+ info->local = local;
+ info->size = len;
+ len = 0;
+
+ len += scnprintf(info->buf + len, info->size - len,
+ "* hw\n"
+ "access name value\n"
+ "R fq_flows_cnt %u\n"
+ "R fq_backlog %u\n"
+ "R fq_overlimit %u\n"
+ "R fq_collisions %u\n"
+ "RW fq_limit %u\n"
+ "RW fq_quantum %u\n",
+ fq->flows_cnt,
+ fq->backlog,
+ fq->overlimit,
+ fq->collisions,
+ fq->limit,
+ fq->quantum);
+
+ len += scnprintf(info->buf + len,
+ info->size - len,
+ "* vif\n"
+ "ifname addr ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n");
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ txqi = to_txq_info(sdata->vif.txq);
+ len += scnprintf(info->buf + len, info->size - len,
+ "%s %pM %u %u %u %u %u %u %u %u\n",
+ sdata->name,
+ sdata->vif.addr,
+ txqi->txq.ac,
+ txqi->tin.backlog_bytes,
+ txqi->tin.backlog_packets,
+ txqi->tin.flows,
+ txqi->tin.overlimit,
+ txqi->tin.collisions,
+ txqi->tin.tx_bytes,
+ txqi->tin.tx_packets);
+ }
+
+ len += scnprintf(info->buf + len,
+ info->size - len,
+ "* sta\n"
+ "ifname addr tid ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n");
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ sdata = sta->sdata;
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ txqi = to_txq_info(sta->sta.txq[i]);
+ len += scnprintf(info->buf + len, info->size - len,
+ "%s %pM %d %d %u %u %u %u %u %u %u\n",
+ sdata->name,
+ sta->sta.addr,
+ txqi->txq.tid,
+ txqi->txq.ac,
+ txqi->tin.backlog_bytes,
+ txqi->tin.backlog_packets,
+ txqi->tin.flows,
+ txqi->tin.overlimit,
+ txqi->tin.collisions,
+ txqi->tin.tx_bytes,
+ txqi->tin.tx_packets);
+ }
+ }
+
+ info->len = len;
+
+ rcu_read_unlock();
+ spin_unlock_bh(&local->fq.lock);
+
+ return 0;
+}
+
+static int aqm_release(struct inode *inode, struct file *file)
+{
+ vfree(file->private_data);
+ return 0;
+}
+
+static ssize_t aqm_read(struct file *file,
+ char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct aqm_info *info = file->private_data;
+
+ return simple_read_from_buffer(user_buf, count, ppos,
+ info->buf, info->len);
+}
+
+static ssize_t aqm_write(struct file *file,
+ const char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct aqm_info *info = file->private_data;
+ struct ieee80211_local *local = info->local;
+ char buf[100];
+ size_t len;
+
+ if (count > sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ buf[sizeof(buf) - 1] = '\0';
+ len = strlen(buf);
+ if (len > 0 && buf[len-1] == '\n')
+ buf[len-1] = 0;
+
+ if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1)
+ return count;
+ else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1)
+ return count;
+
+ return -EINVAL;
+}
+
+static const struct file_operations aqm_ops = {
+ .write = aqm_write,
+ .read = aqm_read,
+ .open = aqm_open,
+ .release = aqm_release,
+ .llseek = default_llseek,
+};
+
#ifdef CONFIG_PM
static ssize_t reset_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
@@ -256,6 +428,7 @@ void debugfs_hw_add(struct ieee80211_local *local)
DEBUGFS_ADD(hwflags);
DEBUGFS_ADD(user_power);
DEBUGFS_ADD(power);
+ DEBUGFS_ADD_MODE(aqm, 0600);
statsd = debugfs_create_dir("statistics", phyd);
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 33dfcbc2b..fd334133f 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -328,14 +328,88 @@ STA_OPS(ht_capa);
static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
- char buf[128], *p = buf;
+ char buf[512], *p = buf;
struct sta_info *sta = file->private_data;
struct ieee80211_sta_vht_cap *vhtc = &sta->sta.vht_cap;
p += scnprintf(p, sizeof(buf) + buf - p, "VHT %ssupported\n",
vhtc->vht_supported ? "" : "not ");
if (vhtc->vht_supported) {
- p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.8x\n", vhtc->cap);
+ p += scnprintf(p, sizeof(buf) + buf - p, "cap: %#.8x\n",
+ vhtc->cap);
+#define PFLAG(a, b) \
+ do { \
+ if (vhtc->cap & IEEE80211_VHT_CAP_ ## a) \
+ p += scnprintf(p, sizeof(buf) + buf - p, \
+ "\t\t%s\n", b); \
+ } while (0)
+
+ switch (vhtc->cap & 0x3) {
+ case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895:
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tMAX-MPDU-3895\n");
+ break;
+ case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991:
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tMAX-MPDU-7991\n");
+ break;
+ case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tMAX-MPDU-11454\n");
+ break;
+ default:
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tMAX-MPDU-UNKNOWN\n");
+ };
+ switch (vhtc->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) {
+ case 0:
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\t80Mhz\n");
+ break;
+ case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ:
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\t160Mhz\n");
+ break;
+ case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ:
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\t80+80Mhz\n");
+ break;
+ default:
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tUNKNOWN-MHZ: 0x%x\n",
+ (vhtc->cap >> 2) & 0x3);
+ };
+ PFLAG(RXLDPC, "RXLDPC");
+ PFLAG(SHORT_GI_80, "SHORT-GI-80");
+ PFLAG(SHORT_GI_160, "SHORT-GI-160");
+ PFLAG(TXSTBC, "TXSTBC");
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tRXSTBC_%d\n", (vhtc->cap >> 8) & 0x7);
+ PFLAG(SU_BEAMFORMER_CAPABLE, "SU-BEAMFORMER-CAPABLE");
+ PFLAG(SU_BEAMFORMEE_CAPABLE, "SU-BEAMFORMEE-CAPABLE");
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tBEAMFORMEE-STS: 0x%x\n",
+ (vhtc->cap & IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK) >>
+ IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT);
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tSOUNDING-DIMENSIONS: 0x%x\n",
+ (vhtc->cap & IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK)
+ >> IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT);
+ PFLAG(MU_BEAMFORMER_CAPABLE, "MU-BEAMFORMER-CAPABLE");
+ PFLAG(MU_BEAMFORMEE_CAPABLE, "MU-BEAMFORMEE-CAPABLE");
+ PFLAG(VHT_TXOP_PS, "TXOP-PS");
+ PFLAG(HTC_VHT, "HTC-VHT");
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tMPDU-LENGTH-EXPONENT: 0x%x\n",
+ (vhtc->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK) >>
+ IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT);
+ PFLAG(VHT_LINK_ADAPTATION_VHT_UNSOL_MFB,
+ "LINK-ADAPTATION-VHT-UNSOL-MFB");
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "\t\tLINK-ADAPTATION-VHT-MRQ-MFB: 0x%x\n",
+ (vhtc->cap & IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB) >> 26);
+ PFLAG(RX_ANTENNA_PATTERN, "RX-ANTENNA-PATTERN");
+ PFLAG(TX_ANTENNA_PATTERN, "TX-ANTENNA-PATTERN");
p += scnprintf(p, sizeof(buf)+buf-p, "RX MCS: %.4x\n",
le16_to_cpu(vhtc->vht_mcs.rx_mcs_map));
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 184473c25..ba5fc1f01 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1094,7 +1094,7 @@ static inline u32 drv_get_expected_throughput(struct ieee80211_local *local,
trace_drv_get_expected_throughput(sta);
if (local->ops->get_expected_throughput)
- ret = local->ops->get_expected_throughput(sta);
+ ret = local->ops->get_expected_throughput(&local->hw, sta);
trace_drv_return_u32(local, ret);
return ret;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9438c9406..f56d342c3 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -30,6 +30,7 @@
#include <net/ieee80211_radiotap.h>
#include <net/cfg80211.h>
#include <net/mac80211.h>
+#include <net/fq.h>
#include "key.h"
#include "sta_info.h"
#include "debug.h"
@@ -805,10 +806,19 @@ enum txq_info_flags {
IEEE80211_TXQ_NO_AMSDU,
};
+/**
+ * struct txq_info - per tid queue
+ *
+ * @tin: contains packets split into multiple flows
+ * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
+ * a fq_flow which is already owned by a different tin
+ * @def_cvars: codel vars for @def_flow
+ */
struct txq_info {
- struct sk_buff_head queue;
+ struct fq_tin tin;
+ struct fq_flow def_flow;
+ struct codel_vars def_cvars;
unsigned long flags;
- unsigned long byte_cnt;
/* keep last! */
struct ieee80211_txq txq;
@@ -856,7 +866,7 @@ struct ieee80211_sub_if_data {
bool control_port_no_encrypt;
int encrypt_headroom;
- atomic_t txqs_len[IEEE80211_NUM_ACS];
+ atomic_t num_tx_queued;
struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
struct mac80211_qos_map __rcu *qos_map;
@@ -1099,6 +1109,11 @@ struct ieee80211_local {
* it first anyway so they become a no-op */
struct ieee80211_hw hw;
+ struct fq fq;
+ struct codel_vars *cvars;
+ struct codel_params cparams;
+ struct codel_stats cstats;
+
const struct ieee80211_ops *ops;
/*
@@ -1235,6 +1250,7 @@ struct ieee80211_local {
int scan_channel_idx;
int scan_ies_len;
int hw_scan_ies_bufsize;
+ struct cfg80211_scan_info scan_info;
struct work_struct sched_scan_stopped_work;
struct ieee80211_sub_if_data __rcu *sched_scan_sdata;
@@ -1931,9 +1947,13 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local)
return true;
}
-void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta,
- struct txq_info *txq, int tid);
+int ieee80211_txq_setup_flows(struct ieee80211_local *local);
+void ieee80211_txq_teardown_flows(struct ieee80211_local *local);
+void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct txq_info *txq, int tid);
+void ieee80211_txq_purge(struct ieee80211_local *local,
+ struct txq_info *txqi);
void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
u16 transaction, u16 auth_alg, u16 status,
const u8 *extra, size_t extra_len, const u8 *bssid,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index c59af3eb9..b123a9e32 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -779,6 +779,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
bool going_down)
{
struct ieee80211_local *local = sdata->local;
+ struct fq *fq = &local->fq;
unsigned long flags;
struct sk_buff *skb, *tmp;
u32 hw_reconf_flags = 0;
@@ -977,12 +978,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.txq) {
struct txq_info *txqi = to_txq_info(sdata->vif.txq);
- spin_lock_bh(&txqi->queue.lock);
- ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
- txqi->byte_cnt = 0;
- spin_unlock_bh(&txqi->queue.lock);
-
- atomic_set(&sdata->txqs_len[txqi->txq.ac], 0);
+ spin_lock_bh(&fq->lock);
+ ieee80211_txq_purge(local, txqi);
+ spin_unlock_bh(&fq->lock);
}
if (local->open_count == 0)
@@ -1198,6 +1196,12 @@ static void ieee80211_if_setup(struct net_device *dev)
dev->destructor = ieee80211_if_free;
}
+static void ieee80211_if_setup_no_queue(struct net_device *dev)
+{
+ ieee80211_if_setup(dev);
+ dev->priv_flags |= IFF_NO_QUEUE;
+}
+
static void ieee80211_iface_work(struct work_struct *work)
{
struct ieee80211_sub_if_data *sdata =
@@ -1707,6 +1711,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
struct net_device *ndev = NULL;
struct ieee80211_sub_if_data *sdata = NULL;
struct txq_info *txqi;
+ void (*if_setup)(struct net_device *dev);
int ret, i;
int txqs = 1;
@@ -1734,12 +1739,17 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
txq_size += sizeof(struct txq_info) +
local->hw.txq_data_size;
+ if (local->ops->wake_tx_queue)
+ if_setup = ieee80211_if_setup_no_queue;
+ else
+ if_setup = ieee80211_if_setup;
+
if (local->hw.queues >= IEEE80211_NUM_ACS)
txqs = IEEE80211_NUM_ACS;
ndev = alloc_netdev_mqs(size + txq_size,
name, name_assign_type,
- ieee80211_if_setup, txqs, 1);
+ if_setup, txqs, 1);
if (!ndev)
return -ENOMEM;
dev_net_set(ndev, wiphy_net(local->hw.wiphy));
@@ -1780,7 +1790,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
if (txq_size) {
txqi = netdev_priv(ndev) + size;
- ieee80211_init_tx_queue(sdata, NULL, txqi, 0);
+ ieee80211_txq_init(sdata, NULL, txqi, 0);
}
sdata->dev = ndev;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 7ee91d615..d00ea9b13 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1055,9 +1055,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
local->dynamic_ps_forced_timeout = -1;
- if (!local->hw.txq_ac_max_pending)
- local->hw.txq_ac_max_pending = 64;
-
result = ieee80211_wep_init(local);
if (result < 0)
wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
@@ -1089,6 +1086,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
rtnl_unlock();
+ result = ieee80211_txq_setup_flows(local);
+ if (result)
+ goto fail_flows;
+
#ifdef CONFIG_INET
local->ifa_notifier.notifier_call = ieee80211_ifa_changed;
result = register_inetaddr_notifier(&local->ifa_notifier);
@@ -1114,6 +1115,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
#if defined(CONFIG_INET) || defined(CONFIG_IPV6)
fail_ifa:
#endif
+ ieee80211_txq_teardown_flows(local);
+ fail_flows:
rtnl_lock();
rate_control_deinitialize(local);
ieee80211_remove_interfaces(local);
@@ -1172,6 +1175,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
skb_queue_purge(&local->skb_queue);
skb_queue_purge(&local->skb_queue_unreliable);
skb_queue_purge(&local->skb_queue_tdls_chsw);
+ ieee80211_txq_teardown_flows(local);
destroy_workqueue(local->workqueue);
wiphy_unregister(local->hw.wiphy);
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 6a1603bcd..42120d965 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -148,25 +148,7 @@ u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata)
void mesh_sta_cleanup(struct sta_info *sta)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
- u32 changed = 0;
-
- /*
- * maybe userspace handles peer allocation and peering, but in either
- * case the beacon is still generated by the kernel and we might need
- * an update.
- */
- if (sdata->u.mesh.user_mpm &&
- sta->mesh->plink_state == NL80211_PLINK_ESTAB)
- changed |= mesh_plink_dec_estab_count(sdata);
- changed |= mesh_accept_plinks_update(sdata);
- if (!sdata->u.mesh.user_mpm) {
- changed |= mesh_plink_deactivate(sta);
- del_timer_sync(&sta->mesh->plink_timer);
- }
-
- /* make sure no readers can access nexthop sta from here on */
- mesh_path_flush_by_nexthop(sta);
- synchronize_net();
+ u32 changed = mesh_plink_deactivate(sta);
if (changed)
ieee80211_mbss_info_change_notify(sdata, changed);
@@ -899,20 +881,22 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
netif_carrier_off(sdata->dev);
+ /* flush STAs and mpaths on this iface */
+ sta_info_flush(sdata);
+ mesh_path_flush_by_iface(sdata);
+
/* stop the beacon */
ifmsh->mesh_id_len = 0;
sdata->vif.bss_conf.enable_beacon = false;
clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
+
+ /* remove beacon */
bcn = rcu_dereference_protected(ifmsh->beacon,
lockdep_is_held(&sdata->wdev.mtx));
RCU_INIT_POINTER(ifmsh->beacon, NULL);
kfree_rcu(bcn, rcu_head);
- /* flush STAs and mpaths on this iface */
- sta_info_flush(sdata);
- mesh_path_flush_by_iface(sdata);
-
/* free all potentially still buffered group-addressed frames */
local->total_ps_buffered -= skb_queue_len(&ifmsh->ps.bc_buf);
skb_queue_purge(&ifmsh->ps.bc_buf);
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 8f9c3bde8..faccef977 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -746,6 +746,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
sta = next_hop_deref_protected(mpath);
if (mpath->flags & MESH_PATH_ACTIVE &&
ether_addr_equal(ta, sta->sta.addr) &&
+ !(mpath->flags & MESH_PATH_FIXED) &&
(!(mpath->flags & MESH_PATH_SN_VALID) ||
SN_GT(target_sn, mpath->sn) || target_sn == 0)) {
mpath->flags &= ~MESH_PATH_ACTIVE;
@@ -1012,7 +1013,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
goto enddiscovery;
spin_lock_bh(&mpath->state_lock);
- if (mpath->flags & MESH_PATH_DELETED) {
+ if (mpath->flags & (MESH_PATH_DELETED | MESH_PATH_FIXED)) {
spin_unlock_bh(&mpath->state_lock);
goto enddiscovery;
}
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 6db2ddfa0..f0e6175a9 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -826,7 +826,7 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop)
mpath->metric = 0;
mpath->hop_count = 0;
mpath->exp_time = 0;
- mpath->flags |= MESH_PATH_FIXED;
+ mpath->flags = MESH_PATH_FIXED | MESH_PATH_SN_VALID;
mesh_path_activate(mpath);
spin_unlock_bh(&mpath->state_lock);
mesh_path_tx_pending(mpath);
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 79f2a0a13..7fcdcf622 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -370,13 +370,21 @@ u32 mesh_plink_deactivate(struct sta_info *sta)
spin_lock_bh(&sta->mesh->plink_lock);
changed = __mesh_plink_deactivate(sta);
- sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED;
- mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CLOSE,
- sta->sta.addr, sta->mesh->llid, sta->mesh->plid,
- sta->mesh->reason);
+
+ if (!sdata->u.mesh.user_mpm) {
+ sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED;
+ mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CLOSE,
+ sta->sta.addr, sta->mesh->llid,
+ sta->mesh->plid, sta->mesh->reason);
+ }
spin_unlock_bh(&sta->mesh->plink_lock);
+ if (!sdata->u.mesh.user_mpm)
+ del_timer_sync(&sta->mesh->plink_timer);
mesh_path_flush_by_nexthop(sta);
+ /* make sure no readers can access nexthop sta from here on */
+ synchronize_net();
+
return changed;
}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 5e65e8389..9dce3b157 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1268,7 +1268,7 @@ static void sta_ps_start(struct sta_info *sta)
for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
- if (!skb_queue_len(&txqi->queue))
+ if (txqi->tin.backlog_packets)
set_bit(tid, &sta->txq_buffered_tids);
else
clear_bit(tid, &sta->txq_buffered_tids);
@@ -1624,8 +1624,13 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
if (mmie_keyidx < NUM_DEFAULT_KEYS ||
mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
return RX_DROP_MONITOR; /* unexpected BIP keyidx */
- if (rx->sta)
+ if (rx->sta) {
+ if (ieee80211_is_group_privacy_action(skb) &&
+ test_sta_flag(rx->sta, WLAN_STA_MFP))
+ return RX_DROP_MONITOR;
+
rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]);
+ }
if (!rx->key)
rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
} else if (!ieee80211_has_protected(fc)) {
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index f9648ef9e..070b40f15 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -7,6 +7,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
+ * Copyright 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -70,6 +71,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
.boottime_ns = rx_status->boottime_ns,
};
bool signal_valid;
+ struct ieee80211_sub_if_data *scan_sdata;
if (ieee80211_hw_check(&local->hw, SIGNAL_DBM))
bss_meta.signal = rx_status->signal * 100;
@@ -83,6 +85,20 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10;
bss_meta.chan = channel;
+
+ rcu_read_lock();
+ scan_sdata = rcu_dereference(local->scan_sdata);
+ if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION &&
+ scan_sdata->vif.bss_conf.assoc &&
+ ieee80211_have_rx_timestamp(rx_status)) {
+ bss_meta.parent_tsf =
+ ieee80211_calculate_rx_timestamp(local, rx_status,
+ len + FCS_LEN, 24);
+ ether_addr_copy(bss_meta.parent_bssid,
+ scan_sdata->vif.bss_conf.bssid);
+ }
+ rcu_read_unlock();
+
cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta,
mgmt, len, GFP_ATOMIC);
if (!cbss)
@@ -345,6 +361,12 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
if (rc == 0)
return;
+
+ /* HW scan failed and is going to be reported as aborted,
+ * so clear old scan info.
+ */
+ memset(&local->scan_info, 0, sizeof(local->scan_info));
+ aborted = true;
}
kfree(local->hw_scan_req);
@@ -353,8 +375,10 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
scan_req = rcu_dereference_protected(local->scan_req,
lockdep_is_held(&local->mtx));
- if (scan_req != local->int_scan_req)
- cfg80211_scan_done(scan_req, aborted);
+ if (scan_req != local->int_scan_req) {
+ local->scan_info.aborted = aborted;
+ cfg80211_scan_done(scan_req, &local->scan_info);
+ }
RCU_INIT_POINTER(local->scan_req, NULL);
scan_sdata = rcu_dereference_protected(local->scan_sdata,
@@ -391,15 +415,19 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
ieee80211_start_next_roc(local);
}
-void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
+void ieee80211_scan_completed(struct ieee80211_hw *hw,
+ struct cfg80211_scan_info *info)
{
struct ieee80211_local *local = hw_to_local(hw);
- trace_api_scan_completed(local, aborted);
+ trace_api_scan_completed(local, info);
set_bit(SCAN_COMPLETED, &local->scanning);
- if (aborted)
+ if (info->aborted)
set_bit(SCAN_ABORTED, &local->scanning);
+
+ memcpy(&local->scan_info, info, sizeof(*info));
+
ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
}
EXPORT_SYMBOL(ieee80211_scan_completed);
@@ -566,6 +594,9 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
local->hw_scan_req->req.ie = ies;
local->hw_scan_req->req.flags = req->flags;
eth_broadcast_addr(local->hw_scan_req->req.bssid);
+ local->hw_scan_req->req.duration = req->duration;
+ local->hw_scan_req->req.duration_mandatory =
+ req->duration_mandatory;
local->hw_scan_band = 0;
@@ -1073,6 +1104,7 @@ void ieee80211_scan_cancel(struct ieee80211_local *local)
*/
cancel_delayed_work(&local->scan_work);
/* and clean up */
+ memset(&local->scan_info, 0, sizeof(local->scan_info));
__ieee80211_scan_completed(&local->hw, true);
out:
mutex_unlock(&local->mtx);
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 2ddc661f0..97f4c9d6b 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -129,42 +129,31 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
}
if (wide_bw_chansw_ie) {
- new_vht_chandef.chan = new_chan;
- new_vht_chandef.center_freq1 =
- ieee80211_channel_to_frequency(
+ struct ieee80211_vht_operation vht_oper = {
+ .chan_width =
+ wide_bw_chansw_ie->new_channel_width,
+ .center_freq_seg1_idx =
wide_bw_chansw_ie->new_center_freq_seg0,
- new_band);
-
- switch (wide_bw_chansw_ie->new_channel_width) {
- default:
- /* hmmm, ignore VHT and use HT if present */
- case IEEE80211_VHT_CHANWIDTH_USE_HT:
+ .center_freq_seg2_idx =
+ wide_bw_chansw_ie->new_center_freq_seg1,
+ /* .basic_mcs_set doesn't matter */
+ };
+
+ /* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT,
+ * to the previously parsed chandef
+ */
+ new_vht_chandef = csa_ie->chandef;
+
+ /* ignore if parsing fails */
+ if (!ieee80211_chandef_vht_oper(&vht_oper, &new_vht_chandef))
new_vht_chandef.chan = NULL;
- break;
- case IEEE80211_VHT_CHANWIDTH_80MHZ:
- new_vht_chandef.width = NL80211_CHAN_WIDTH_80;
- break;
- case IEEE80211_VHT_CHANWIDTH_160MHZ:
- new_vht_chandef.width = NL80211_CHAN_WIDTH_160;
- break;
- case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
- /* field is otherwise reserved */
- new_vht_chandef.center_freq2 =
- ieee80211_channel_to_frequency(
- wide_bw_chansw_ie->new_center_freq_seg1,
- new_band);
- new_vht_chandef.width = NL80211_CHAN_WIDTH_80P80;
- break;
- }
+
if (sta_flags & IEEE80211_STA_DISABLE_80P80MHZ &&
new_vht_chandef.width == NL80211_CHAN_WIDTH_80P80)
ieee80211_chandef_downgrade(&new_vht_chandef);
if (sta_flags & IEEE80211_STA_DISABLE_160MHZ &&
new_vht_chandef.width == NL80211_CHAN_WIDTH_160)
ieee80211_chandef_downgrade(&new_vht_chandef);
- if (sta_flags & IEEE80211_STA_DISABLE_40MHZ &&
- new_vht_chandef.width > NL80211_CHAN_WIDTH_20)
- ieee80211_chandef_downgrade(&new_vht_chandef);
}
/* if VHT data is there validate & use it */
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 5ccfdbd40..aa58df80e 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -90,6 +90,7 @@ static void __cleanup_single_sta(struct sta_info *sta)
struct tid_ampdu_tx *tid_tx;
struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
+ struct fq *fq = &local->fq;
struct ps_data *ps;
if (test_sta_flag(sta, WLAN_STA_PS_STA) ||
@@ -113,11 +114,10 @@ static void __cleanup_single_sta(struct sta_info *sta)
if (sta->sta.txq[0]) {
for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
- int n = skb_queue_len(&txqi->queue);
- ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
- atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]);
- txqi->byte_cnt = 0;
+ spin_lock_bh(&fq->lock);
+ ieee80211_txq_purge(local, txqi);
+ spin_unlock_bh(&fq->lock);
}
}
@@ -368,7 +368,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
struct txq_info *txq = txq_data + i * size;
- ieee80211_init_tx_queue(sdata, sta, txq, i);
+ ieee80211_txq_init(sdata, sta, txq, i);
}
}
@@ -1211,7 +1211,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
- if (!skb_queue_len(&txqi->queue))
+ if (!txqi->tin.backlog_packets)
continue;
drv_wake_tx_queue(local, txqi);
@@ -1616,7 +1616,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
sta_info_recalc_tim(sta);
} else {
- unsigned long tids = sta->txq_buffered_tids & driver_release_tids;
int tid;
/*
@@ -1648,7 +1647,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
- if (!(tids & BIT(tid)) || skb_queue_len(&txqi->queue))
+ if (!(driver_release_tids & BIT(tid)) ||
+ txqi->tin.backlog_packets)
continue;
sta_info_recalc_tim(sta);
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index c6d5c724e..a2a682696 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -771,6 +771,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
clear_sta_flag(sta, WLAN_STA_SP);
acked = !!(info->flags & IEEE80211_TX_STAT_ACK);
+
+ /* mesh Peer Service Period support */
+ if (ieee80211_vif_is_mesh(&sta->sdata->vif) &&
+ ieee80211_is_data_qos(fc))
+ ieee80211_mpsp_trigger_process(
+ ieee80211_get_qos_ctl(hdr), sta, true, acked);
+
if (!acked && test_sta_flag(sta, WLAN_STA_PS_STA)) {
/*
* The STA is in power save mode, so assume
@@ -781,13 +788,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
return;
}
- /* mesh Peer Service Period support */
- if (ieee80211_vif_is_mesh(&sta->sdata->vif) &&
- ieee80211_is_data_qos(fc))
- ieee80211_mpsp_trigger_process(
- ieee80211_get_qos_ctl(hdr),
- sta, true, acked);
-
if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) &&
(ieee80211_is_data(hdr->frame_control)) &&
(rates_idx != -1))
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 1c7d45a6d..afca7d103 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -333,10 +333,11 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
if (!uc.center_freq1)
return;
- /* proceed to downgrade the chandef until usable or the same */
+ /* proceed to downgrade the chandef until usable or the same as AP BW */
while (uc.width > max_width ||
- !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc,
- sdata->wdev.iftype))
+ (uc.width > sta->tdls_chandef.width &&
+ !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc,
+ sdata->wdev.iftype)))
ieee80211_chandef_downgrade(&uc);
if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) {
@@ -1747,6 +1748,7 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata,
goto out;
}
+ ret = 0;
call_drv:
drv_tdls_recv_channel_switch(sdata->local, sdata, &params);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 8bad2ad81..18b285e06 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -24,7 +24,10 @@
#include <net/ieee80211_radiotap.h>
#include <net/cfg80211.h>
#include <net/mac80211.h>
+#include <net/codel.h>
+#include <net/codel_impl.h>
#include <asm/unaligned.h>
+#include <net/fq_impl.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
@@ -590,6 +593,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
else if (tx->sta &&
(key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
tx->key = key;
+ else if (ieee80211_is_group_privacy_action(tx->skb) &&
+ (key = rcu_dereference(tx->sdata->default_multicast_key)))
+ tx->key = key;
else if (ieee80211_is_mgmt(hdr->frame_control) &&
is_multicast_ether_addr(hdr->addr1) &&
ieee80211_is_robust_mgmt_frame(tx->skb) &&
@@ -622,7 +628,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
case WLAN_CIPHER_SUITE_GCMP_256:
if (!ieee80211_is_data_present(hdr->frame_control) &&
!ieee80211_use_mfp(hdr->frame_control, tx->sta,
- tx->skb))
+ tx->skb) &&
+ !ieee80211_is_group_privacy_action(tx->skb))
tx->key = NULL;
else
skip_hw = (tx->key->conf.flags &
@@ -789,6 +796,36 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
return ret;
}
+static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *pubsta,
+ struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_txq *txq = NULL;
+
+ if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
+ (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
+ return NULL;
+
+ if (!ieee80211_is_data(hdr->frame_control))
+ return NULL;
+
+ if (pubsta) {
+ u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
+
+ txq = pubsta->txq[tid];
+ } else if (vif) {
+ txq = vif->txq;
+ }
+
+ if (!txq)
+ return NULL;
+
+ return to_txq_info(txq);
+}
+
static ieee80211_tx_result debug_noinline
ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
{
@@ -846,7 +883,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
tx->sta->tx_stats.msdu[tid]++;
- if (!tx->sta->sta.txq[0])
+ if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta,
+ tx->skb))
hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
return TX_CONTINUE;
@@ -1236,82 +1274,229 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
return TX_CONTINUE;
}
-static void ieee80211_drv_tx(struct ieee80211_local *local,
- struct ieee80211_vif *vif,
- struct ieee80211_sta *pubsta,
+static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb)
+{
+ IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time();
+}
+
+static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi)
+{
+ IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif;
+}
+
+static u32 codel_skb_len_func(const struct sk_buff *skb)
+{
+ return skb->len;
+}
+
+static codel_time_t codel_skb_time_func(const struct sk_buff *skb)
+{
+ const struct ieee80211_tx_info *info;
+
+ info = (const struct ieee80211_tx_info *)skb->cb;
+ return info->control.enqueue_time;
+}
+
+static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars,
+ void *ctx)
+{
+ struct ieee80211_local *local;
+ struct txq_info *txqi;
+ struct fq *fq;
+ struct fq_flow *flow;
+
+ txqi = ctx;
+ local = vif_to_sdata(txqi->txq.vif)->local;
+ fq = &local->fq;
+
+ if (cvars == &txqi->def_cvars)
+ flow = &txqi->def_flow;
+ else
+ flow = &fq->flows[cvars - local->cvars];
+
+ return fq_flow_dequeue(fq, flow);
+}
+
+static void codel_drop_func(struct sk_buff *skb,
+ void *ctx)
+{
+ struct ieee80211_local *local;
+ struct ieee80211_hw *hw;
+ struct txq_info *txqi;
+
+ txqi = ctx;
+ local = vif_to_sdata(txqi->txq.vif)->local;
+ hw = &local->hw;
+
+ ieee80211_free_txskb(hw, skb);
+}
+
+static struct sk_buff *fq_tin_dequeue_func(struct fq *fq,
+ struct fq_tin *tin,
+ struct fq_flow *flow)
+{
+ struct ieee80211_local *local;
+ struct txq_info *txqi;
+ struct codel_vars *cvars;
+ struct codel_params *cparams;
+ struct codel_stats *cstats;
+
+ local = container_of(fq, struct ieee80211_local, fq);
+ txqi = container_of(tin, struct txq_info, tin);
+ cparams = &local->cparams;
+ cstats = &local->cstats;
+
+ if (flow == &txqi->def_flow)
+ cvars = &txqi->def_cvars;
+ else
+ cvars = &local->cvars[flow - fq->flows];
+
+ return codel_dequeue(txqi,
+ &flow->backlog,
+ cparams,
+ cvars,
+ cstats,
+ codel_skb_len_func,
+ codel_skb_time_func,
+ codel_drop_func,
+ codel_dequeue_func);
+}
+
+static void fq_skb_free_func(struct fq *fq,
+ struct fq_tin *tin,
+ struct fq_flow *flow,
struct sk_buff *skb)
{
- struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
- struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
- struct ieee80211_tx_control control = {
- .sta = pubsta,
- };
- struct ieee80211_txq *txq = NULL;
+ struct ieee80211_local *local;
+
+ local = container_of(fq, struct ieee80211_local, fq);
+ ieee80211_free_txskb(&local->hw, skb);
+}
+
+static struct fq_flow *fq_flow_get_default_func(struct fq *fq,
+ struct fq_tin *tin,
+ int idx,
+ struct sk_buff *skb)
+{
struct txq_info *txqi;
- u8 ac;
- if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
- (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
- goto tx_normal;
+ txqi = container_of(tin, struct txq_info, tin);
+ return &txqi->def_flow;
+}
- if (!ieee80211_is_data(hdr->frame_control))
- goto tx_normal;
+static void ieee80211_txq_enqueue(struct ieee80211_local *local,
+ struct txq_info *txqi,
+ struct sk_buff *skb)
+{
+ struct fq *fq = &local->fq;
+ struct fq_tin *tin = &txqi->tin;
- if (pubsta) {
- u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
+ ieee80211_set_skb_enqueue_time(skb);
+ fq_tin_enqueue(fq, tin, skb,
+ fq_skb_free_func,
+ fq_flow_get_default_func);
+}
- txq = pubsta->txq[tid];
- } else if (vif) {
- txq = vif->txq;
+void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct txq_info *txqi, int tid)
+{
+ fq_tin_init(&txqi->tin);
+ fq_flow_init(&txqi->def_flow);
+ codel_vars_init(&txqi->def_cvars);
+
+ txqi->txq.vif = &sdata->vif;
+
+ if (sta) {
+ txqi->txq.sta = &sta->sta;
+ sta->sta.txq[tid] = &txqi->txq;
+ txqi->txq.tid = tid;
+ txqi->txq.ac = ieee802_1d_to_ac[tid & 7];
+ } else {
+ sdata->vif.txq = &txqi->txq;
+ txqi->txq.tid = 0;
+ txqi->txq.ac = IEEE80211_AC_BE;
}
+}
- if (!txq)
- goto tx_normal;
+void ieee80211_txq_purge(struct ieee80211_local *local,
+ struct txq_info *txqi)
+{
+ struct fq *fq = &local->fq;
+ struct fq_tin *tin = &txqi->tin;
- ac = txq->ac;
- txqi = to_txq_info(txq);
- atomic_inc(&sdata->txqs_len[ac]);
- if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending)
- netif_stop_subqueue(sdata->dev, ac);
+ fq_tin_reset(fq, tin, fq_skb_free_func);
+}
+
+int ieee80211_txq_setup_flows(struct ieee80211_local *local)
+{
+ struct fq *fq = &local->fq;
+ int ret;
+ int i;
+
+ if (!local->ops->wake_tx_queue)
+ return 0;
+
+ ret = fq_init(fq, 4096);
+ if (ret)
+ return ret;
+
+ codel_params_init(&local->cparams);
+ codel_stats_init(&local->cstats);
+ local->cparams.interval = MS2TIME(100);
+ local->cparams.target = MS2TIME(20);
+ local->cparams.ecn = true;
+
+ local->cvars = kcalloc(fq->flows_cnt, sizeof(local->cvars[0]),
+ GFP_KERNEL);
+ if (!local->cvars) {
+ spin_lock_bh(&fq->lock);
+ fq_reset(fq, fq_skb_free_func);
+ spin_unlock_bh(&fq->lock);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < fq->flows_cnt; i++)
+ codel_vars_init(&local->cvars[i]);
- spin_lock_bh(&txqi->queue.lock);
- txqi->byte_cnt += skb->len;
- __skb_queue_tail(&txqi->queue, skb);
- spin_unlock_bh(&txqi->queue.lock);
+ return 0;
+}
+
+void ieee80211_txq_teardown_flows(struct ieee80211_local *local)
+{
+ struct fq *fq = &local->fq;
- drv_wake_tx_queue(local, txqi);
+ if (!local->ops->wake_tx_queue)
+ return;
- return;
+ kfree(local->cvars);
+ local->cvars = NULL;
-tx_normal:
- drv_tx(local, &control, skb);
+ spin_lock_bh(&fq->lock);
+ fq_reset(fq, fq_skb_free_func);
+ spin_unlock_bh(&fq->lock);
}
struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
struct ieee80211_txq *txq)
{
struct ieee80211_local *local = hw_to_local(hw);
- struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif);
struct txq_info *txqi = container_of(txq, struct txq_info, txq);
struct ieee80211_hdr *hdr;
struct sk_buff *skb = NULL;
- u8 ac = txq->ac;
+ struct fq *fq = &local->fq;
+ struct fq_tin *tin = &txqi->tin;
- spin_lock_bh(&txqi->queue.lock);
+ spin_lock_bh(&fq->lock);
if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
goto out;
- skb = __skb_dequeue(&txqi->queue);
+ skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
if (!skb)
goto out;
- txqi->byte_cnt -= skb->len;
-
- atomic_dec(&sdata->txqs_len[ac]);
- if (__netif_subqueue_stopped(sdata->dev, ac))
- ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]);
+ ieee80211_set_skb_vif(skb, txqi);
hdr = (struct ieee80211_hdr *)skb->data;
if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
@@ -1327,11 +1512,15 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
}
out:
- spin_unlock_bh(&txqi->queue.lock);
+ spin_unlock_bh(&fq->lock);
if (skb && skb_has_frag_list(skb) &&
- !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
- skb_linearize(skb);
+ !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) {
+ if (skb_linearize(skb)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ return NULL;
+ }
+ }
return skb;
}
@@ -1343,7 +1532,10 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
struct sk_buff_head *skbs,
bool txpending)
{
+ struct ieee80211_tx_control control = {};
+ struct fq *fq = &local->fq;
struct sk_buff *skb, *tmp;
+ struct txq_info *txqi;
unsigned long flags;
skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1358,6 +1550,21 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
}
#endif
+ txqi = ieee80211_get_txq(local, vif, sta, skb);
+ if (txqi) {
+ info->control.vif = vif;
+
+ __skb_unlink(skb, skbs);
+
+ spin_lock_bh(&fq->lock);
+ ieee80211_txq_enqueue(local, txqi, skb);
+ spin_unlock_bh(&fq->lock);
+
+ drv_wake_tx_queue(local, txqi);
+
+ continue;
+ }
+
spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
if (local->queue_stop_reasons[q] ||
(!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1400,9 +1607,10 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
info->control.vif = vif;
+ control.sta = sta;
__skb_unlink(skb, skbs);
- ieee80211_drv_tx(local, vif, sta, skb);
+ drv_tx(local, &control, skb);
}
return true;
@@ -2882,6 +3090,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb)
{
struct ieee80211_local *local = sdata->local;
+ struct fq *fq = &local->fq;
+ struct fq_tin *tin;
+ struct fq_flow *flow;
u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
struct ieee80211_txq *txq = sta->sta.txq[tid];
struct txq_info *txqi;
@@ -2893,6 +3104,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
__be16 len;
void *data;
bool ret = false;
+ unsigned int orig_len;
int n = 1, nfrags;
if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
@@ -2909,12 +3121,20 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
max_amsdu_len = min_t(int, max_amsdu_len,
sta->sta.max_rc_amsdu_len);
- spin_lock_bh(&txqi->queue.lock);
+ spin_lock_bh(&fq->lock);
+
+ /* TODO: Ideally aggregation should be done on dequeue to remain
+ * responsive to environment changes.
+ */
- head = skb_peek_tail(&txqi->queue);
+ tin = &txqi->tin;
+ flow = fq_flow_classify(fq, tin, skb, fq_flow_get_default_func);
+ head = skb_peek_tail(&flow->queue);
if (!head)
goto out;
+ orig_len = head->len;
+
if (skb->len + head->len > max_amsdu_len)
goto out;
@@ -2953,8 +3173,13 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
head->data_len += skb->len;
*frag_tail = skb;
+ flow->backlog += head->len - orig_len;
+ tin->backlog_bytes += head->len - orig_len;
+
+ fq_recalc_backlog(fq, tin, flow);
+
out:
- spin_unlock_bh(&txqi->queue.lock);
+ spin_unlock_bh(&fq->lock);
return ret;
}
@@ -3044,7 +3269,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
*ieee80211_get_qos_ctl(hdr) = tid;
- if (!sta->sta.txq[0])
+ if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb))
hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
} else {
info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 905003f75..42bf0b668 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -244,6 +244,9 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
struct ieee80211_sub_if_data *sdata;
int n_acs = IEEE80211_NUM_ACS;
+ if (local->ops->wake_tx_queue)
+ return;
+
if (local->hw.queues < IEEE80211_NUM_ACS)
n_acs = 1;
@@ -260,11 +263,6 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
for (ac = 0; ac < n_acs; ac++) {
int ac_queue = sdata->vif.hw_queue[ac];
- if (local->ops->wake_tx_queue &&
- (atomic_read(&sdata->txqs_len[ac]) >
- local->hw.txq_ac_max_pending))
- continue;
-
if (ac_queue == queue ||
(sdata->vif.cab_queue == queue &&
local->queue_stop_reasons[ac_queue] == 0 &&
@@ -352,6 +350,9 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue]))
return;
+ if (local->ops->wake_tx_queue)
+ return;
+
if (local->hw.queues < IEEE80211_NUM_ACS)
n_acs = 1;
@@ -3388,25 +3389,6 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo)
return buf;
}
-void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta,
- struct txq_info *txqi, int tid)
-{
- skb_queue_head_init(&txqi->queue);
- txqi->txq.vif = &sdata->vif;
-
- if (sta) {
- txqi->txq.sta = &sta->sta;
- sta->sta.txq[tid] = &txqi->txq;
- txqi->txq.tid = tid;
- txqi->txq.ac = ieee802_1d_to_ac[tid & 7];
- } else {
- sdata->vif.txq = &txqi->txq;
- txqi->txq.tid = 0;
- txqi->txq.ac = IEEE80211_AC_BE;
- }
-}
-
void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
unsigned long *frame_cnt,
unsigned long *byte_cnt)
@@ -3414,9 +3396,9 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
struct txq_info *txqi = to_txq_info(txq);
if (frame_cnt)
- *frame_cnt = txqi->queue.qlen;
+ *frame_cnt = txqi->tin.backlog_packets;
if (byte_cnt)
- *byte_cnt = txqi->byte_cnt;
+ *byte_cnt = txqi->tin.backlog_bytes;
}
EXPORT_SYMBOL(ieee80211_txq_get_depth);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 0b80a7140..5c161e775 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -91,7 +91,7 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
if (skb->len <= mtu)
return false;
- if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
return false;
return true;
@@ -1009,9 +1009,12 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
unsigned int flags;
if (event == NETDEV_REGISTER) {
- /* For now just support ethernet devices */
- if ((dev->type == ARPHRD_ETHER) ||
- (dev->type == ARPHRD_LOOPBACK)) {
+ /* For now just support Ethernet, IPGRE, SIT and IPIP devices */
+ if (dev->type == ARPHRD_ETHER ||
+ dev->type == ARPHRD_LOOPBACK ||
+ dev->type == ARPHRD_IPGRE ||
+ dev->type == ARPHRD_SIT ||
+ dev->type == ARPHRD_TUNNEL) {
mdev = mpls_add_dev(dev);
if (IS_ERR(mdev))
return notifier_from_errno(PTR_ERR(mdev));
diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig
new file mode 100644
index 000000000..08a8a6031
--- /dev/null
+++ b/net/ncsi/Kconfig
@@ -0,0 +1,12 @@
+#
+# Configuration for NCSI support
+#
+
+config NET_NCSI
+ bool "NCSI interface support"
+ depends on INET
+ ---help---
+ This module provides NCSI (Network Controller Sideband Interface)
+ support. Enable this only if your system connects to a network
+ device via NCSI and the ethernet driver you're using supports
+ the protocol explicitly.
diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile
new file mode 100644
index 000000000..dd12b564f
--- /dev/null
+++ b/net/ncsi/Makefile
@@ -0,0 +1,4 @@
+#
+# Makefile for NCSI API
+#
+obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
new file mode 100644
index 000000000..33738c060
--- /dev/null
+++ b/net/ncsi/internal.h
@@ -0,0 +1,328 @@
+/*
+ * Copyright Gavin Shan, IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __NCSI_INTERNAL_H__
+#define __NCSI_INTERNAL_H__
+
+enum {
+ NCSI_CAP_BASE = 0,
+ NCSI_CAP_GENERIC = 0,
+ NCSI_CAP_BC,
+ NCSI_CAP_MC,
+ NCSI_CAP_BUFFER,
+ NCSI_CAP_AEN,
+ NCSI_CAP_VLAN,
+ NCSI_CAP_MAX
+};
+
+enum {
+ NCSI_CAP_GENERIC_HWA = 0x01, /* HW arbitration */
+ NCSI_CAP_GENERIC_HDS = 0x02, /* HNC driver status change */
+ NCSI_CAP_GENERIC_FC = 0x04, /* HNC to MC flow control */
+ NCSI_CAP_GENERIC_FC1 = 0x08, /* MC to HNC flow control */
+ NCSI_CAP_GENERIC_MC = 0x10, /* Global MC filtering */
+ NCSI_CAP_GENERIC_HWA_UNKNOWN = 0x00, /* Unknown HW arbitration */
+ NCSI_CAP_GENERIC_HWA_SUPPORT = 0x20, /* Supported HW arbitration */
+ NCSI_CAP_GENERIC_HWA_NOT_SUPPORT = 0x40, /* No HW arbitration */
+ NCSI_CAP_GENERIC_HWA_RESERVED = 0x60, /* Reserved HW arbitration */
+ NCSI_CAP_GENERIC_HWA_MASK = 0x60, /* Mask for HW arbitration */
+ NCSI_CAP_GENERIC_MASK = 0x7f,
+ NCSI_CAP_BC_ARP = 0x01, /* ARP packet filtering */
+ NCSI_CAP_BC_DHCPC = 0x02, /* DHCP client filtering */
+ NCSI_CAP_BC_DHCPS = 0x04, /* DHCP server filtering */
+ NCSI_CAP_BC_NETBIOS = 0x08, /* NetBIOS packet filtering */
+ NCSI_CAP_BC_MASK = 0x0f,
+ NCSI_CAP_MC_IPV6_NEIGHBOR = 0x01, /* IPv6 neighbor filtering */
+ NCSI_CAP_MC_IPV6_ROUTER = 0x02, /* IPv6 router filering */
+ NCSI_CAP_MC_DHCPV6_RELAY = 0x04, /* DHCPv6 relay / server MC */
+ NCSI_CAP_MC_DHCPV6_WELL_KNOWN = 0x08, /* DHCPv6 well-known MC */
+ NCSI_CAP_MC_IPV6_MLD = 0x10, /* IPv6 MLD filtering */
+ NCSI_CAP_MC_IPV6_NEIGHBOR_S = 0x20, /* IPv6 neighbour filtering */
+ NCSI_CAP_MC_MASK = 0x3f,
+ NCSI_CAP_AEN_LSC = 0x01, /* Link status change */
+ NCSI_CAP_AEN_CR = 0x02, /* Configuration required */
+ NCSI_CAP_AEN_HDS = 0x04, /* HNC driver status */
+ NCSI_CAP_AEN_MASK = 0x07,
+ NCSI_CAP_VLAN_ONLY = 0x01, /* Filter VLAN packet only */
+ NCSI_CAP_VLAN_NO = 0x02, /* Filter VLAN and non-VLAN */
+ NCSI_CAP_VLAN_ANY = 0x04, /* Filter Any-and-non-VLAN */
+ NCSI_CAP_VLAN_MASK = 0x07
+};
+
+enum {
+ NCSI_MODE_BASE = 0,
+ NCSI_MODE_ENABLE = 0,
+ NCSI_MODE_TX_ENABLE,
+ NCSI_MODE_LINK,
+ NCSI_MODE_VLAN,
+ NCSI_MODE_BC,
+ NCSI_MODE_MC,
+ NCSI_MODE_AEN,
+ NCSI_MODE_FC,
+ NCSI_MODE_MAX
+};
+
+enum {
+ NCSI_FILTER_BASE = 0,
+ NCSI_FILTER_VLAN = 0,
+ NCSI_FILTER_UC,
+ NCSI_FILTER_MC,
+ NCSI_FILTER_MIXED,
+ NCSI_FILTER_MAX
+};
+
+struct ncsi_channel_version {
+ u32 version; /* Supported BCD encoded NCSI version */
+ u32 alpha2; /* Supported BCD encoded NCSI version */
+ u8 fw_name[12]; /* Firware name string */
+ u32 fw_version; /* Firmware version */
+ u16 pci_ids[4]; /* PCI identification */
+ u32 mf_id; /* Manufacture ID */
+};
+
+struct ncsi_channel_cap {
+ u32 index; /* Index of channel capabilities */
+ u32 cap; /* NCSI channel capability */
+};
+
+struct ncsi_channel_mode {
+ u32 index; /* Index of channel modes */
+ u32 enable; /* Enabled or disabled */
+ u32 size; /* Valid entries in ncm_data[] */
+ u32 data[8]; /* Data entries */
+};
+
+struct ncsi_channel_filter {
+ u32 index; /* Index of channel filters */
+ u32 total; /* Total entries in the filter table */
+ u64 bitmap; /* Bitmap of valid entries */
+ u32 data[]; /* Data for the valid entries */
+};
+
+struct ncsi_channel_stats {
+ u32 hnc_cnt_hi; /* Counter cleared */
+ u32 hnc_cnt_lo; /* Counter cleared */
+ u32 hnc_rx_bytes; /* Rx bytes */
+ u32 hnc_tx_bytes; /* Tx bytes */
+ u32 hnc_rx_uc_pkts; /* Rx UC packets */
+ u32 hnc_rx_mc_pkts; /* Rx MC packets */
+ u32 hnc_rx_bc_pkts; /* Rx BC packets */
+ u32 hnc_tx_uc_pkts; /* Tx UC packets */
+ u32 hnc_tx_mc_pkts; /* Tx MC packets */
+ u32 hnc_tx_bc_pkts; /* Tx BC packets */
+ u32 hnc_fcs_err; /* FCS errors */
+ u32 hnc_align_err; /* Alignment errors */
+ u32 hnc_false_carrier; /* False carrier detection */
+ u32 hnc_runt_pkts; /* Rx runt packets */
+ u32 hnc_jabber_pkts; /* Rx jabber packets */
+ u32 hnc_rx_pause_xon; /* Rx pause XON frames */
+ u32 hnc_rx_pause_xoff; /* Rx XOFF frames */
+ u32 hnc_tx_pause_xon; /* Tx XON frames */
+ u32 hnc_tx_pause_xoff; /* Tx XOFF frames */
+ u32 hnc_tx_s_collision; /* Single collision frames */
+ u32 hnc_tx_m_collision; /* Multiple collision frames */
+ u32 hnc_l_collision; /* Late collision frames */
+ u32 hnc_e_collision; /* Excessive collision frames */
+ u32 hnc_rx_ctl_frames; /* Rx control frames */
+ u32 hnc_rx_64_frames; /* Rx 64-bytes frames */
+ u32 hnc_rx_127_frames; /* Rx 65-127 bytes frames */
+ u32 hnc_rx_255_frames; /* Rx 128-255 bytes frames */
+ u32 hnc_rx_511_frames; /* Rx 256-511 bytes frames */
+ u32 hnc_rx_1023_frames; /* Rx 512-1023 bytes frames */
+ u32 hnc_rx_1522_frames; /* Rx 1024-1522 bytes frames */
+ u32 hnc_rx_9022_frames; /* Rx 1523-9022 bytes frames */
+ u32 hnc_tx_64_frames; /* Tx 64-bytes frames */
+ u32 hnc_tx_127_frames; /* Tx 65-127 bytes frames */
+ u32 hnc_tx_255_frames; /* Tx 128-255 bytes frames */
+ u32 hnc_tx_511_frames; /* Tx 256-511 bytes frames */
+ u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */
+ u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */
+ u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */
+ u32 hnc_rx_valid_bytes; /* Rx valid bytes */
+ u32 hnc_rx_runt_pkts; /* Rx error runt packets */
+ u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */
+ u32 ncsi_rx_cmds; /* Rx NCSI commands */
+ u32 ncsi_dropped_cmds; /* Dropped commands */
+ u32 ncsi_cmd_type_errs; /* Command type errors */
+ u32 ncsi_cmd_csum_errs; /* Command checksum errors */
+ u32 ncsi_rx_pkts; /* Rx NCSI packets */
+ u32 ncsi_tx_pkts; /* Tx NCSI packets */
+ u32 ncsi_tx_aen_pkts; /* Tx AEN packets */
+ u32 pt_tx_pkts; /* Tx packets */
+ u32 pt_tx_dropped; /* Tx dropped packets */
+ u32 pt_tx_channel_err; /* Tx channel errors */
+ u32 pt_tx_us_err; /* Tx undersize errors */
+ u32 pt_rx_pkts; /* Rx packets */
+ u32 pt_rx_dropped; /* Rx dropped packets */
+ u32 pt_rx_channel_err; /* Rx channel errors */
+ u32 pt_rx_us_err; /* Rx undersize errors */
+ u32 pt_rx_os_err; /* Rx oversize errors */
+};
+
+struct ncsi_dev_priv;
+struct ncsi_package;
+
+#define NCSI_PACKAGE_SHIFT 5
+#define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7)
+#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
+#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c))
+
+struct ncsi_channel {
+ unsigned char id;
+ int state;
+#define NCSI_CHANNEL_INACTIVE 1
+#define NCSI_CHANNEL_ACTIVE 2
+#define NCSI_CHANNEL_INVISIBLE 3
+ spinlock_t lock; /* Protect filters etc */
+ struct ncsi_package *package;
+ struct ncsi_channel_version version;
+ struct ncsi_channel_cap caps[NCSI_CAP_MAX];
+ struct ncsi_channel_mode modes[NCSI_MODE_MAX];
+ struct ncsi_channel_filter *filters[NCSI_FILTER_MAX];
+ struct ncsi_channel_stats stats;
+ struct timer_list timer; /* Link monitor timer */
+ bool enabled; /* Timer is enabled */
+ unsigned int timeout; /* Times of timeout */
+ struct list_head node;
+ struct list_head link;
+};
+
+struct ncsi_package {
+ unsigned char id; /* NCSI 3-bits package ID */
+ unsigned char uuid[16]; /* UUID */
+ struct ncsi_dev_priv *ndp; /* NCSI device */
+ spinlock_t lock; /* Protect the package */
+ unsigned int channel_num; /* Number of channels */
+ struct list_head channels; /* List of chanels */
+ struct list_head node; /* Form list of packages */
+};
+
+struct ncsi_request {
+ unsigned char id; /* Request ID - 0 to 255 */
+ bool used; /* Request that has been assigned */
+ bool driven; /* Drive state machine */
+ struct ncsi_dev_priv *ndp; /* Associated NCSI device */
+ struct sk_buff *cmd; /* Associated NCSI command packet */
+ struct sk_buff *rsp; /* Associated NCSI response packet */
+ struct timer_list timer; /* Timer on waiting for response */
+ bool enabled; /* Time has been enabled or not */
+};
+
+enum {
+ ncsi_dev_state_major = 0xff00,
+ ncsi_dev_state_minor = 0x00ff,
+ ncsi_dev_state_probe_deselect = 0x0201,
+ ncsi_dev_state_probe_package,
+ ncsi_dev_state_probe_channel,
+ ncsi_dev_state_probe_cis,
+ ncsi_dev_state_probe_gvi,
+ ncsi_dev_state_probe_gc,
+ ncsi_dev_state_probe_gls,
+ ncsi_dev_state_probe_dp,
+ ncsi_dev_state_config_sp = 0x0301,
+ ncsi_dev_state_config_cis,
+ ncsi_dev_state_config_sma,
+ ncsi_dev_state_config_ebf,
+#if IS_ENABLED(CONFIG_IPV6)
+ ncsi_dev_state_config_egmf,
+#endif
+ ncsi_dev_state_config_ecnt,
+ ncsi_dev_state_config_ec,
+ ncsi_dev_state_config_ae,
+ ncsi_dev_state_config_gls,
+ ncsi_dev_state_config_done,
+ ncsi_dev_state_suspend_select = 0x0401,
+ ncsi_dev_state_suspend_dcnt,
+ ncsi_dev_state_suspend_dc,
+ ncsi_dev_state_suspend_deselect,
+ ncsi_dev_state_suspend_done
+};
+
+struct ncsi_dev_priv {
+ struct ncsi_dev ndev; /* Associated NCSI device */
+ unsigned int flags; /* NCSI device flags */
+#define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */
+#define NCSI_DEV_HWA 2 /* Enabled HW arbitration */
+#define NCSI_DEV_RESHUFFLE 4
+ spinlock_t lock; /* Protect the NCSI device */
+#if IS_ENABLED(CONFIG_IPV6)
+ unsigned int inet6_addr_num; /* Number of IPv6 addresses */
+#endif
+ unsigned int package_num; /* Number of packages */
+ struct list_head packages; /* List of packages */
+ struct ncsi_request requests[256]; /* Request table */
+ unsigned int request_id; /* Last used request ID */
+ unsigned int pending_req_num; /* Number of pending requests */
+ struct ncsi_package *active_package; /* Currently handled package */
+ struct ncsi_channel *active_channel; /* Currently handled channel */
+ struct list_head channel_queue; /* Config queue of channels */
+ struct work_struct work; /* For channel management */
+ struct packet_type ptype; /* NCSI packet Rx handler */
+ struct list_head node; /* Form NCSI device list */
+};
+
+struct ncsi_cmd_arg {
+ struct ncsi_dev_priv *ndp; /* Associated NCSI device */
+ unsigned char type; /* Command in the NCSI packet */
+ unsigned char id; /* Request ID (sequence number) */
+ unsigned char package; /* Destination package ID */
+ unsigned char channel; /* Detination channel ID or 0x1f */
+ unsigned short payload; /* Command packet payload length */
+ bool driven; /* Drive the state machine? */
+ union {
+ unsigned char bytes[16]; /* Command packet specific data */
+ unsigned short words[8];
+ unsigned int dwords[4];
+ };
+};
+
+extern struct list_head ncsi_dev_list;
+extern spinlock_t ncsi_dev_lock;
+
+#define TO_NCSI_DEV_PRIV(nd) \
+ container_of(nd, struct ncsi_dev_priv, ndev)
+#define NCSI_FOR_EACH_DEV(ndp) \
+ list_for_each_entry_rcu(ndp, &ncsi_dev_list, node)
+#define NCSI_FOR_EACH_PACKAGE(ndp, np) \
+ list_for_each_entry_rcu(np, &ndp->packages, node)
+#define NCSI_FOR_EACH_CHANNEL(np, nc) \
+ list_for_each_entry_rcu(nc, &np->channels, node)
+
+/* Resources */
+int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data);
+int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data);
+int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index);
+void ncsi_start_channel_monitor(struct ncsi_channel *nc);
+void ncsi_stop_channel_monitor(struct ncsi_channel *nc);
+struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
+ unsigned char id);
+struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np,
+ unsigned char id);
+struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp,
+ unsigned char id);
+struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp,
+ unsigned char id);
+void ncsi_remove_package(struct ncsi_package *np);
+void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
+ unsigned char id,
+ struct ncsi_package **np,
+ struct ncsi_channel **nc);
+struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven);
+void ncsi_free_request(struct ncsi_request *nr);
+struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
+int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
+
+/* Packet handlers */
+u32 ncsi_calculate_checksum(unsigned char *data, int len);
+int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca);
+int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev);
+int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb);
+
+#endif /* __NCSI_INTERNAL_H__ */
diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
new file mode 100644
index 000000000..d46346844
--- /dev/null
+++ b/net/ncsi/ncsi-aen.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright Gavin Shan, IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/ncsi.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "internal.h"
+#include "ncsi-pkt.h"
+
+static int ncsi_validate_aen_pkt(struct ncsi_aen_pkt_hdr *h,
+ const unsigned short payload)
+{
+ u32 checksum;
+ __be32 *pchecksum;
+
+ if (h->common.revision != NCSI_PKT_REVISION)
+ return -EINVAL;
+ if (ntohs(h->common.length) != payload)
+ return -EINVAL;
+
+ /* Validate checksum, which might be zeroes if the
+ * sender doesn't support checksum according to NCSI
+ * specification.
+ */
+ pchecksum = (__be32 *)((void *)(h + 1) + payload - 4);
+ if (ntohl(*pchecksum) == 0)
+ return 0;
+
+ checksum = ncsi_calculate_checksum((unsigned char *)h,
+ sizeof(*h) + payload - 4);
+ if (*pchecksum != htonl(checksum))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
+ struct ncsi_aen_pkt_hdr *h)
+{
+ struct ncsi_aen_lsc_pkt *lsc;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+ unsigned long old_data;
+ unsigned long flags;
+
+ /* Find the NCSI channel */
+ ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Update the link status */
+ ncm = &nc->modes[NCSI_MODE_LINK];
+ lsc = (struct ncsi_aen_lsc_pkt *)h;
+ old_data = ncm->data[2];
+ ncm->data[2] = ntohl(lsc->status);
+ ncm->data[4] = ntohl(lsc->oem_status);
+ if (!((old_data ^ ncm->data[2]) & 0x1) ||
+ !list_empty(&nc->link))
+ return 0;
+ if (!(nc->state == NCSI_CHANNEL_INACTIVE && (ncm->data[2] & 0x1)) &&
+ !(nc->state == NCSI_CHANNEL_ACTIVE && !(ncm->data[2] & 0x1)))
+ return 0;
+
+ if (!(ndp->flags & NCSI_DEV_HWA) &&
+ nc->state == NCSI_CHANNEL_ACTIVE)
+ ndp->flags |= NCSI_DEV_RESHUFFLE;
+
+ ncsi_stop_channel_monitor(nc);
+ spin_lock_irqsave(&ndp->lock, flags);
+ list_add_tail_rcu(&nc->link, &ndp->channel_queue);
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ return ncsi_process_next_channel(ndp);
+}
+
+static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp,
+ struct ncsi_aen_pkt_hdr *h)
+{
+ struct ncsi_channel *nc;
+ unsigned long flags;
+
+ /* Find the NCSI channel */
+ ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ if (!list_empty(&nc->link) ||
+ nc->state != NCSI_CHANNEL_ACTIVE)
+ return 0;
+
+ ncsi_stop_channel_monitor(nc);
+ spin_lock_irqsave(&ndp->lock, flags);
+ xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ list_add_tail_rcu(&nc->link, &ndp->channel_queue);
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ return ncsi_process_next_channel(ndp);
+}
+
+static int ncsi_aen_handler_hncdsc(struct ncsi_dev_priv *ndp,
+ struct ncsi_aen_pkt_hdr *h)
+{
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+ struct ncsi_aen_hncdsc_pkt *hncdsc;
+ unsigned long flags;
+
+ /* Find the NCSI channel */
+ ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* If the channel is active one, we need reconfigure it */
+ ncm = &nc->modes[NCSI_MODE_LINK];
+ hncdsc = (struct ncsi_aen_hncdsc_pkt *)h;
+ ncm->data[3] = ntohl(hncdsc->status);
+ if (!list_empty(&nc->link) ||
+ nc->state != NCSI_CHANNEL_ACTIVE ||
+ (ncm->data[3] & 0x1))
+ return 0;
+
+ if (ndp->flags & NCSI_DEV_HWA)
+ ndp->flags |= NCSI_DEV_RESHUFFLE;
+
+ /* If this channel is the active one and the link doesn't
+ * work, we have to choose another channel to be active one.
+ * The logic here is exactly similar to what we do when link
+ * is down on the active channel.
+ */
+ ncsi_stop_channel_monitor(nc);
+ spin_lock_irqsave(&ndp->lock, flags);
+ list_add_tail_rcu(&nc->link, &ndp->channel_queue);
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ ncsi_process_next_channel(ndp);
+
+ return 0;
+}
+
+static struct ncsi_aen_handler {
+ unsigned char type;
+ int payload;
+ int (*handler)(struct ncsi_dev_priv *ndp,
+ struct ncsi_aen_pkt_hdr *h);
+} ncsi_aen_handlers[] = {
+ { NCSI_PKT_AEN_LSC, 12, ncsi_aen_handler_lsc },
+ { NCSI_PKT_AEN_CR, 4, ncsi_aen_handler_cr },
+ { NCSI_PKT_AEN_HNCDSC, 4, ncsi_aen_handler_hncdsc }
+};
+
+int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb)
+{
+ struct ncsi_aen_pkt_hdr *h;
+ struct ncsi_aen_handler *nah = NULL;
+ int i, ret;
+
+ /* Find the handler */
+ h = (struct ncsi_aen_pkt_hdr *)skb_network_header(skb);
+ for (i = 0; i < ARRAY_SIZE(ncsi_aen_handlers); i++) {
+ if (ncsi_aen_handlers[i].type == h->type) {
+ nah = &ncsi_aen_handlers[i];
+ break;
+ }
+ }
+
+ if (!nah) {
+ netdev_warn(ndp->ndev.dev, "Invalid AEN (0x%x) received\n",
+ h->type);
+ return -ENOENT;
+ }
+
+ ret = ncsi_validate_aen_pkt(h, nah->payload);
+ if (ret)
+ goto out;
+
+ ret = nah->handler(ndp, h);
+out:
+ consume_skb(skb);
+ return ret;
+}
diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c
new file mode 100644
index 000000000..21057a8ce
--- /dev/null
+++ b/net/ncsi/ncsi-cmd.c
@@ -0,0 +1,367 @@
+/*
+ * Copyright Gavin Shan, IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/ncsi.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "internal.h"
+#include "ncsi-pkt.h"
+
+u32 ncsi_calculate_checksum(unsigned char *data, int len)
+{
+ u32 checksum = 0;
+ int i;
+
+ for (i = 0; i < len; i += 2)
+ checksum += (((u32)data[i] << 8) | data[i + 1]);
+
+ checksum = (~checksum + 1);
+ return checksum;
+}
+
+/* This function should be called after the data area has been
+ * populated completely.
+ */
+static void ncsi_cmd_build_header(struct ncsi_pkt_hdr *h,
+ struct ncsi_cmd_arg *nca)
+{
+ u32 checksum;
+ __be32 *pchecksum;
+
+ h->mc_id = 0;
+ h->revision = NCSI_PKT_REVISION;
+ h->reserved = 0;
+ h->id = nca->id;
+ h->type = nca->type;
+ h->channel = NCSI_TO_CHANNEL(nca->package,
+ nca->channel);
+ h->length = htons(nca->payload);
+ h->reserved1[0] = 0;
+ h->reserved1[1] = 0;
+
+ /* Fill with calculated checksum */
+ checksum = ncsi_calculate_checksum((unsigned char *)h,
+ sizeof(*h) + nca->payload);
+ pchecksum = (__be32 *)((void *)h + sizeof(struct ncsi_pkt_hdr) +
+ nca->payload);
+ *pchecksum = htonl(checksum);
+}
+
+static int ncsi_cmd_handler_default(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_sp(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_sp_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_sp_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->hw_arbitration = nca->bytes[0];
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_dc(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_dc_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_dc_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->ald = nca->bytes[0];
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_rc(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_rc_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_rc_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_ae(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_ae_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_ae_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->mc_id = nca->bytes[0];
+ cmd->mode = htonl(nca->dwords[1]);
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_sl(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_sl_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_sl_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->mode = htonl(nca->dwords[0]);
+ cmd->oem_mode = htonl(nca->dwords[1]);
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_svf(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_svf_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_svf_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->vlan = htons(nca->words[0]);
+ cmd->index = nca->bytes[2];
+ cmd->enable = nca->bytes[3];
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_ev(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_ev_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_ev_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->mode = nca->bytes[0];
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_sma(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_sma_pkt *cmd;
+ int i;
+
+ cmd = (struct ncsi_cmd_sma_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ for (i = 0; i < 6; i++)
+ cmd->mac[i] = nca->bytes[i];
+ cmd->index = nca->bytes[6];
+ cmd->at_e = nca->bytes[7];
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_ebf(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_ebf_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_ebf_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->mode = htonl(nca->dwords[0]);
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_egmf(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_egmf_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_egmf_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->mode = htonl(nca->dwords[0]);
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static int ncsi_cmd_handler_snfc(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_snfc_pkt *cmd;
+
+ cmd = (struct ncsi_cmd_snfc_pkt *)skb_put(skb, sizeof(*cmd));
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->mode = nca->bytes[0];
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
+static struct ncsi_cmd_handler {
+ unsigned char type;
+ int payload;
+ int (*handler)(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca);
+} ncsi_cmd_handlers[] = {
+ { NCSI_PKT_CMD_CIS, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_SP, 4, ncsi_cmd_handler_sp },
+ { NCSI_PKT_CMD_DP, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_EC, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_DC, 4, ncsi_cmd_handler_dc },
+ { NCSI_PKT_CMD_RC, 4, ncsi_cmd_handler_rc },
+ { NCSI_PKT_CMD_ECNT, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_DCNT, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_AE, 8, ncsi_cmd_handler_ae },
+ { NCSI_PKT_CMD_SL, 8, ncsi_cmd_handler_sl },
+ { NCSI_PKT_CMD_GLS, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_SVF, 4, ncsi_cmd_handler_svf },
+ { NCSI_PKT_CMD_EV, 4, ncsi_cmd_handler_ev },
+ { NCSI_PKT_CMD_DV, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_SMA, 8, ncsi_cmd_handler_sma },
+ { NCSI_PKT_CMD_EBF, 4, ncsi_cmd_handler_ebf },
+ { NCSI_PKT_CMD_DBF, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_EGMF, 4, ncsi_cmd_handler_egmf },
+ { NCSI_PKT_CMD_DGMF, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_SNFC, 4, ncsi_cmd_handler_snfc },
+ { NCSI_PKT_CMD_GVI, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_GC, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_GP, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_GCPS, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_GNS, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_GNPTS, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_GPS, 0, ncsi_cmd_handler_default },
+ { NCSI_PKT_CMD_OEM, 0, NULL },
+ { NCSI_PKT_CMD_PLDM, 0, NULL },
+ { NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default }
+};
+
+static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_dev_priv *ndp = nca->ndp;
+ struct ncsi_dev *nd = &ndp->ndev;
+ struct net_device *dev = nd->dev;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ int len = hlen + tlen;
+ struct sk_buff *skb;
+ struct ncsi_request *nr;
+
+ nr = ncsi_alloc_request(ndp, nca->driven);
+ if (!nr)
+ return NULL;
+
+ /* NCSI command packet has 16-bytes header, payload, 4 bytes checksum.
+ * The packet needs padding if its payload is less than 26 bytes to
+ * meet 64 bytes minimal ethernet frame length.
+ */
+ len += sizeof(struct ncsi_cmd_pkt_hdr) + 4;
+ if (nca->payload < 26)
+ len += 26;
+ else
+ len += nca->payload;
+
+ /* Allocate skb */
+ skb = alloc_skb(len, GFP_ATOMIC);
+ if (!skb) {
+ ncsi_free_request(nr);
+ return NULL;
+ }
+
+ nr->cmd = skb;
+ skb_reserve(skb, hlen);
+ skb_reset_network_header(skb);
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_NCSI);
+
+ return nr;
+}
+
+int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_request *nr;
+ struct ethhdr *eh;
+ struct ncsi_cmd_handler *nch = NULL;
+ int i, ret;
+
+ /* Search for the handler */
+ for (i = 0; i < ARRAY_SIZE(ncsi_cmd_handlers); i++) {
+ if (ncsi_cmd_handlers[i].type == nca->type) {
+ if (ncsi_cmd_handlers[i].handler)
+ nch = &ncsi_cmd_handlers[i];
+ else
+ nch = NULL;
+
+ break;
+ }
+ }
+
+ if (!nch) {
+ netdev_err(nca->ndp->ndev.dev,
+ "Cannot send packet with type 0x%02x\n", nca->type);
+ return -ENOENT;
+ }
+
+ /* Get packet payload length and allocate the request */
+ nca->payload = nch->payload;
+ nr = ncsi_alloc_command(nca);
+ if (!nr)
+ return -ENOMEM;
+
+ /* Prepare the packet */
+ nca->id = nr->id;
+ ret = nch->handler(nr->cmd, nca);
+ if (ret) {
+ ncsi_free_request(nr);
+ return ret;
+ }
+
+ /* Fill the ethernet header */
+ eh = (struct ethhdr *)skb_push(nr->cmd, sizeof(*eh));
+ eh->h_proto = htons(ETH_P_NCSI);
+ eth_broadcast_addr(eh->h_dest);
+ eth_broadcast_addr(eh->h_source);
+
+ /* Start the timer for the request that might not have
+ * corresponding response. Given NCSI is an internal
+ * connection a 1 second delay should be sufficient.
+ */
+ nr->enabled = true;
+ mod_timer(&nr->timer, jiffies + 1 * HZ);
+
+ /* Send NCSI packet */
+ skb_get(nr->cmd);
+ ret = dev_queue_xmit(nr->cmd);
+ if (ret < 0) {
+ ncsi_free_request(nr);
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
new file mode 100644
index 000000000..ef017b871
--- /dev/null
+++ b/net/ncsi/ncsi-manage.c
@@ -0,0 +1,1205 @@
+/*
+ * Copyright Gavin Shan, IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+
+#include <net/ncsi.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/if_inet6.h>
+
+#include "internal.h"
+#include "ncsi-pkt.h"
+
+LIST_HEAD(ncsi_dev_list);
+DEFINE_SPINLOCK(ncsi_dev_lock);
+
+static inline int ncsi_filter_size(int table)
+{
+ int sizes[] = { 2, 6, 6, 6 };
+
+ BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX);
+ if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX)
+ return -EINVAL;
+
+ return sizes[table];
+}
+
+int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data)
+{
+ struct ncsi_channel_filter *ncf;
+ void *bitmap;
+ int index, size;
+ unsigned long flags;
+
+ ncf = nc->filters[table];
+ if (!ncf)
+ return -ENXIO;
+
+ size = ncsi_filter_size(table);
+ if (size < 0)
+ return size;
+
+ spin_lock_irqsave(&nc->lock, flags);
+ bitmap = (void *)&ncf->bitmap;
+ index = -1;
+ while ((index = find_next_bit(bitmap, ncf->total, index + 1))
+ < ncf->total) {
+ if (!memcmp(ncf->data + size * index, data, size)) {
+ spin_unlock_irqrestore(&nc->lock, flags);
+ return index;
+ }
+ }
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ return -ENOENT;
+}
+
+int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data)
+{
+ struct ncsi_channel_filter *ncf;
+ int index, size;
+ void *bitmap;
+ unsigned long flags;
+
+ size = ncsi_filter_size(table);
+ if (size < 0)
+ return size;
+
+ index = ncsi_find_filter(nc, table, data);
+ if (index >= 0)
+ return index;
+
+ ncf = nc->filters[table];
+ if (!ncf)
+ return -ENODEV;
+
+ spin_lock_irqsave(&nc->lock, flags);
+ bitmap = (void *)&ncf->bitmap;
+ do {
+ index = find_next_zero_bit(bitmap, ncf->total, 0);
+ if (index >= ncf->total) {
+ spin_unlock_irqrestore(&nc->lock, flags);
+ return -ENOSPC;
+ }
+ } while (test_and_set_bit(index, bitmap));
+
+ memcpy(ncf->data + size * index, data, size);
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ return index;
+}
+
+int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index)
+{
+ struct ncsi_channel_filter *ncf;
+ int size;
+ void *bitmap;
+ unsigned long flags;
+
+ size = ncsi_filter_size(table);
+ if (size < 0)
+ return size;
+
+ ncf = nc->filters[table];
+ if (!ncf || index >= ncf->total)
+ return -ENODEV;
+
+ spin_lock_irqsave(&nc->lock, flags);
+ bitmap = (void *)&ncf->bitmap;
+ if (test_and_clear_bit(index, bitmap))
+ memset(ncf->data + size * index, 0, size);
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ return 0;
+}
+
+static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
+{
+ struct ncsi_dev *nd = &ndp->ndev;
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+
+ nd->state = ncsi_dev_state_functional;
+ if (force_down) {
+ nd->link_up = 0;
+ goto report;
+ }
+
+ nd->link_up = 0;
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ if (!list_empty(&nc->link) ||
+ nc->state != NCSI_CHANNEL_ACTIVE)
+ continue;
+
+ if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
+ nd->link_up = 1;
+ goto report;
+ }
+ }
+ }
+
+report:
+ nd->handler(nd);
+}
+
+static void ncsi_channel_monitor(unsigned long data)
+{
+ struct ncsi_channel *nc = (struct ncsi_channel *)data;
+ struct ncsi_package *np = nc->package;
+ struct ncsi_dev_priv *ndp = np->ndp;
+ struct ncsi_cmd_arg nca;
+ bool enabled;
+ unsigned int timeout;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nc->lock, flags);
+ timeout = nc->timeout;
+ enabled = nc->enabled;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ if (!enabled || !list_empty(&nc->link))
+ return;
+ if (nc->state != NCSI_CHANNEL_INACTIVE &&
+ nc->state != NCSI_CHANNEL_ACTIVE)
+ return;
+
+ if (!(timeout % 2)) {
+ nca.ndp = ndp;
+ nca.package = np->id;
+ nca.channel = nc->id;
+ nca.type = NCSI_PKT_CMD_GLS;
+ nca.driven = false;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret) {
+ netdev_err(ndp->ndev.dev, "Error %d sending GLS\n",
+ ret);
+ return;
+ }
+ }
+
+ if (timeout + 1 >= 3) {
+ if (!(ndp->flags & NCSI_DEV_HWA) &&
+ nc->state == NCSI_CHANNEL_ACTIVE)
+ ncsi_report_link(ndp, true);
+
+ spin_lock_irqsave(&ndp->lock, flags);
+ xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ list_add_tail_rcu(&nc->link, &ndp->channel_queue);
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ ncsi_process_next_channel(ndp);
+ return;
+ }
+
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->timeout = timeout + 1;
+ nc->enabled = true;
+ spin_unlock_irqrestore(&nc->lock, flags);
+ mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2)));
+}
+
+void ncsi_start_channel_monitor(struct ncsi_channel *nc)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&nc->lock, flags);
+ WARN_ON_ONCE(nc->enabled);
+ nc->timeout = 0;
+ nc->enabled = true;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2)));
+}
+
+void ncsi_stop_channel_monitor(struct ncsi_channel *nc)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&nc->lock, flags);
+ if (!nc->enabled) {
+ spin_unlock_irqrestore(&nc->lock, flags);
+ return;
+ }
+ nc->enabled = false;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ del_timer_sync(&nc->timer);
+}
+
+struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
+ unsigned char id)
+{
+ struct ncsi_channel *nc;
+
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ if (nc->id == id)
+ return nc;
+ }
+
+ return NULL;
+}
+
+struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
+{
+ struct ncsi_channel *nc, *tmp;
+ int index;
+ unsigned long flags;
+
+ nc = kzalloc(sizeof(*nc), GFP_ATOMIC);
+ if (!nc)
+ return NULL;
+
+ nc->id = id;
+ nc->package = np;
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ nc->enabled = false;
+ setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc);
+ spin_lock_init(&nc->lock);
+ INIT_LIST_HEAD(&nc->link);
+ for (index = 0; index < NCSI_CAP_MAX; index++)
+ nc->caps[index].index = index;
+ for (index = 0; index < NCSI_MODE_MAX; index++)
+ nc->modes[index].index = index;
+
+ spin_lock_irqsave(&np->lock, flags);
+ tmp = ncsi_find_channel(np, id);
+ if (tmp) {
+ spin_unlock_irqrestore(&np->lock, flags);
+ kfree(nc);
+ return tmp;
+ }
+
+ list_add_tail_rcu(&nc->node, &np->channels);
+ np->channel_num++;
+ spin_unlock_irqrestore(&np->lock, flags);
+
+ return nc;
+}
+
+static void ncsi_remove_channel(struct ncsi_channel *nc)
+{
+ struct ncsi_package *np = nc->package;
+ struct ncsi_channel_filter *ncf;
+ unsigned long flags;
+ int i;
+
+ /* Release filters */
+ spin_lock_irqsave(&nc->lock, flags);
+ for (i = 0; i < NCSI_FILTER_MAX; i++) {
+ ncf = nc->filters[i];
+ if (!ncf)
+ continue;
+
+ nc->filters[i] = NULL;
+ kfree(ncf);
+ }
+
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+ ncsi_stop_channel_monitor(nc);
+
+ /* Remove and free channel */
+ spin_lock_irqsave(&np->lock, flags);
+ list_del_rcu(&nc->node);
+ np->channel_num--;
+ spin_unlock_irqrestore(&np->lock, flags);
+
+ kfree(nc);
+}
+
+struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp,
+ unsigned char id)
+{
+ struct ncsi_package *np;
+
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ if (np->id == id)
+ return np;
+ }
+
+ return NULL;
+}
+
+struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp,
+ unsigned char id)
+{
+ struct ncsi_package *np, *tmp;
+ unsigned long flags;
+
+ np = kzalloc(sizeof(*np), GFP_ATOMIC);
+ if (!np)
+ return NULL;
+
+ np->id = id;
+ np->ndp = ndp;
+ spin_lock_init(&np->lock);
+ INIT_LIST_HEAD(&np->channels);
+
+ spin_lock_irqsave(&ndp->lock, flags);
+ tmp = ncsi_find_package(ndp, id);
+ if (tmp) {
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ kfree(np);
+ return tmp;
+ }
+
+ list_add_tail_rcu(&np->node, &ndp->packages);
+ ndp->package_num++;
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ return np;
+}
+
+void ncsi_remove_package(struct ncsi_package *np)
+{
+ struct ncsi_dev_priv *ndp = np->ndp;
+ struct ncsi_channel *nc, *tmp;
+ unsigned long flags;
+
+ /* Release all child channels */
+ list_for_each_entry_safe(nc, tmp, &np->channels, node)
+ ncsi_remove_channel(nc);
+
+ /* Remove and free package */
+ spin_lock_irqsave(&ndp->lock, flags);
+ list_del_rcu(&np->node);
+ ndp->package_num--;
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ kfree(np);
+}
+
+void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
+ unsigned char id,
+ struct ncsi_package **np,
+ struct ncsi_channel **nc)
+{
+ struct ncsi_package *p;
+ struct ncsi_channel *c;
+
+ p = ncsi_find_package(ndp, NCSI_PACKAGE_INDEX(id));
+ c = p ? ncsi_find_channel(p, NCSI_CHANNEL_INDEX(id)) : NULL;
+
+ if (np)
+ *np = p;
+ if (nc)
+ *nc = c;
+}
+
+/* For two consecutive NCSI commands, the packet IDs shouldn't
+ * be same. Otherwise, the bogus response might be replied. So
+ * the available IDs are allocated in round-robin fashion.
+ */
+struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven)
+{
+ struct ncsi_request *nr = NULL;
+ int i, limit = ARRAY_SIZE(ndp->requests);
+ unsigned long flags;
+
+ /* Check if there is one available request until the ceiling */
+ spin_lock_irqsave(&ndp->lock, flags);
+ for (i = ndp->request_id; !nr && i < limit; i++) {
+ if (ndp->requests[i].used)
+ continue;
+
+ nr = &ndp->requests[i];
+ nr->used = true;
+ nr->driven = driven;
+ if (++ndp->request_id >= limit)
+ ndp->request_id = 0;
+ }
+
+ /* Fail back to check from the starting cursor */
+ for (i = 0; !nr && i < ndp->request_id; i++) {
+ if (ndp->requests[i].used)
+ continue;
+
+ nr = &ndp->requests[i];
+ nr->used = true;
+ nr->driven = driven;
+ if (++ndp->request_id >= limit)
+ ndp->request_id = 0;
+ }
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ return nr;
+}
+
+void ncsi_free_request(struct ncsi_request *nr)
+{
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct sk_buff *cmd, *rsp;
+ unsigned long flags;
+ bool driven;
+
+ if (nr->enabled) {
+ nr->enabled = false;
+ del_timer_sync(&nr->timer);
+ }
+
+ spin_lock_irqsave(&ndp->lock, flags);
+ cmd = nr->cmd;
+ rsp = nr->rsp;
+ nr->cmd = NULL;
+ nr->rsp = NULL;
+ nr->used = false;
+ driven = nr->driven;
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ if (driven && cmd && --ndp->pending_req_num == 0)
+ schedule_work(&ndp->work);
+
+ /* Release command and response */
+ consume_skb(cmd);
+ consume_skb(rsp);
+}
+
+struct ncsi_dev *ncsi_find_dev(struct net_device *dev)
+{
+ struct ncsi_dev_priv *ndp;
+
+ NCSI_FOR_EACH_DEV(ndp) {
+ if (ndp->ndev.dev == dev)
+ return &ndp->ndev;
+ }
+
+ return NULL;
+}
+
+static void ncsi_request_timeout(unsigned long data)
+{
+ struct ncsi_request *nr = (struct ncsi_request *)data;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ unsigned long flags;
+
+ /* If the request already had associated response,
+ * let the response handler to release it.
+ */
+ spin_lock_irqsave(&ndp->lock, flags);
+ nr->enabled = false;
+ if (nr->rsp || !nr->cmd) {
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ return;
+ }
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ /* Release the request */
+ ncsi_free_request(nr);
+}
+
+static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
+{
+ struct ncsi_dev *nd = &ndp->ndev;
+ struct ncsi_package *np = ndp->active_package;
+ struct ncsi_channel *nc = ndp->active_channel;
+ struct ncsi_cmd_arg nca;
+ int ret;
+
+ nca.ndp = ndp;
+ nca.driven = true;
+ switch (nd->state) {
+ case ncsi_dev_state_suspend:
+ nd->state = ncsi_dev_state_suspend_select;
+ /* Fall through */
+ case ncsi_dev_state_suspend_select:
+ case ncsi_dev_state_suspend_dcnt:
+ case ncsi_dev_state_suspend_dc:
+ case ncsi_dev_state_suspend_deselect:
+ ndp->pending_req_num = 1;
+
+ np = ndp->active_package;
+ nc = ndp->active_channel;
+ nca.package = np->id;
+ if (nd->state == ncsi_dev_state_suspend_select) {
+ nca.type = NCSI_PKT_CMD_SP;
+ nca.channel = 0x1f;
+ if (ndp->flags & NCSI_DEV_HWA)
+ nca.bytes[0] = 0;
+ else
+ nca.bytes[0] = 1;
+ nd->state = ncsi_dev_state_suspend_dcnt;
+ } else if (nd->state == ncsi_dev_state_suspend_dcnt) {
+ nca.type = NCSI_PKT_CMD_DCNT;
+ nca.channel = nc->id;
+ nd->state = ncsi_dev_state_suspend_dc;
+ } else if (nd->state == ncsi_dev_state_suspend_dc) {
+ nca.type = NCSI_PKT_CMD_DC;
+ nca.channel = nc->id;
+ nca.bytes[0] = 1;
+ nd->state = ncsi_dev_state_suspend_deselect;
+ } else if (nd->state == ncsi_dev_state_suspend_deselect) {
+ nca.type = NCSI_PKT_CMD_DP;
+ nca.channel = 0x1f;
+ nd->state = ncsi_dev_state_suspend_done;
+ }
+
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret) {
+ nd->state = ncsi_dev_state_functional;
+ return;
+ }
+
+ break;
+ case ncsi_dev_state_suspend_done:
+ xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ ncsi_process_next_channel(ndp);
+
+ break;
+ default:
+ netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n",
+ nd->state);
+ }
+}
+
+static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
+{
+ struct ncsi_dev *nd = &ndp->ndev;
+ struct net_device *dev = nd->dev;
+ struct ncsi_package *np = ndp->active_package;
+ struct ncsi_channel *nc = ndp->active_channel;
+ struct ncsi_cmd_arg nca;
+ unsigned char index;
+ int ret;
+
+ nca.ndp = ndp;
+ nca.driven = true;
+ switch (nd->state) {
+ case ncsi_dev_state_config:
+ case ncsi_dev_state_config_sp:
+ ndp->pending_req_num = 1;
+
+ /* Select the specific package */
+ nca.type = NCSI_PKT_CMD_SP;
+ if (ndp->flags & NCSI_DEV_HWA)
+ nca.bytes[0] = 0;
+ else
+ nca.bytes[0] = 1;
+ nca.package = np->id;
+ nca.channel = 0x1f;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+
+ nd->state = ncsi_dev_state_config_cis;
+ break;
+ case ncsi_dev_state_config_cis:
+ ndp->pending_req_num = 1;
+
+ /* Clear initial state */
+ nca.type = NCSI_PKT_CMD_CIS;
+ nca.package = np->id;
+ nca.channel = nc->id;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+
+ nd->state = ncsi_dev_state_config_sma;
+ break;
+ case ncsi_dev_state_config_sma:
+ case ncsi_dev_state_config_ebf:
+#if IS_ENABLED(CONFIG_IPV6)
+ case ncsi_dev_state_config_egmf:
+#endif
+ case ncsi_dev_state_config_ecnt:
+ case ncsi_dev_state_config_ec:
+ case ncsi_dev_state_config_ae:
+ case ncsi_dev_state_config_gls:
+ ndp->pending_req_num = 1;
+
+ nca.package = np->id;
+ nca.channel = nc->id;
+
+ /* Use first entry in unicast filter table. Note that
+ * the MAC filter table starts from entry 1 instead of
+ * 0.
+ */
+ if (nd->state == ncsi_dev_state_config_sma) {
+ nca.type = NCSI_PKT_CMD_SMA;
+ for (index = 0; index < 6; index++)
+ nca.bytes[index] = dev->dev_addr[index];
+ nca.bytes[6] = 0x1;
+ nca.bytes[7] = 0x1;
+ nd->state = ncsi_dev_state_config_ebf;
+ } else if (nd->state == ncsi_dev_state_config_ebf) {
+ nca.type = NCSI_PKT_CMD_EBF;
+ nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap;
+ nd->state = ncsi_dev_state_config_ecnt;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (ndp->inet6_addr_num > 0 &&
+ (nc->caps[NCSI_CAP_GENERIC].cap &
+ NCSI_CAP_GENERIC_MC))
+ nd->state = ncsi_dev_state_config_egmf;
+ else
+ nd->state = ncsi_dev_state_config_ecnt;
+ } else if (nd->state == ncsi_dev_state_config_egmf) {
+ nca.type = NCSI_PKT_CMD_EGMF;
+ nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
+ nd->state = ncsi_dev_state_config_ecnt;
+#endif /* CONFIG_IPV6 */
+ } else if (nd->state == ncsi_dev_state_config_ecnt) {
+ nca.type = NCSI_PKT_CMD_ECNT;
+ nd->state = ncsi_dev_state_config_ec;
+ } else if (nd->state == ncsi_dev_state_config_ec) {
+ /* Enable AEN if it's supported */
+ nca.type = NCSI_PKT_CMD_EC;
+ nd->state = ncsi_dev_state_config_ae;
+ if (!(nc->caps[NCSI_CAP_AEN].cap & NCSI_CAP_AEN_MASK))
+ nd->state = ncsi_dev_state_config_gls;
+ } else if (nd->state == ncsi_dev_state_config_ae) {
+ nca.type = NCSI_PKT_CMD_AE;
+ nca.bytes[0] = 0;
+ nca.dwords[1] = nc->caps[NCSI_CAP_AEN].cap;
+ nd->state = ncsi_dev_state_config_gls;
+ } else if (nd->state == ncsi_dev_state_config_gls) {
+ nca.type = NCSI_PKT_CMD_GLS;
+ nd->state = ncsi_dev_state_config_done;
+ }
+
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+ break;
+ case ncsi_dev_state_config_done:
+ if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1)
+ xchg(&nc->state, NCSI_CHANNEL_ACTIVE);
+ else
+ xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+
+ ncsi_start_channel_monitor(nc);
+ ncsi_process_next_channel(ndp);
+ break;
+ default:
+ netdev_warn(dev, "Wrong NCSI state 0x%x in config\n",
+ nd->state);
+ }
+
+ return;
+
+error:
+ ncsi_report_link(ndp, true);
+}
+
+static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
+{
+ struct ncsi_package *np;
+ struct ncsi_channel *nc, *found;
+ struct ncsi_channel_mode *ncm;
+ unsigned long flags;
+
+ /* The search is done once an inactive channel with up
+ * link is found.
+ */
+ found = NULL;
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ if (!list_empty(&nc->link) ||
+ nc->state != NCSI_CHANNEL_INACTIVE)
+ continue;
+
+ if (!found)
+ found = nc;
+
+ ncm = &nc->modes[NCSI_MODE_LINK];
+ if (ncm->data[2] & 0x1) {
+ found = nc;
+ goto out;
+ }
+ }
+ }
+
+ if (!found) {
+ ncsi_report_link(ndp, true);
+ return -ENODEV;
+ }
+
+out:
+ spin_lock_irqsave(&ndp->lock, flags);
+ list_add_tail_rcu(&found->link, &ndp->channel_queue);
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ return ncsi_process_next_channel(ndp);
+}
+
+static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp)
+{
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ unsigned int cap;
+
+ /* The hardware arbitration is disabled if any one channel
+ * doesn't support explicitly.
+ */
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ cap = nc->caps[NCSI_CAP_GENERIC].cap;
+ if (!(cap & NCSI_CAP_GENERIC_HWA) ||
+ (cap & NCSI_CAP_GENERIC_HWA_MASK) !=
+ NCSI_CAP_GENERIC_HWA_SUPPORT) {
+ ndp->flags &= ~NCSI_DEV_HWA;
+ return false;
+ }
+ }
+ }
+
+ ndp->flags |= NCSI_DEV_HWA;
+ return true;
+}
+
+static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp)
+{
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ unsigned long flags;
+
+ /* Move all available channels to processing queue */
+ spin_lock_irqsave(&ndp->lock, flags);
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE ||
+ !list_empty(&nc->link));
+ ncsi_stop_channel_monitor(nc);
+ list_add_tail_rcu(&nc->link, &ndp->channel_queue);
+ }
+ }
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ /* We can have no channels in extremely case */
+ if (list_empty(&ndp->channel_queue)) {
+ ncsi_report_link(ndp, false);
+ return -ENOENT;
+ }
+
+ return ncsi_process_next_channel(ndp);
+}
+
+static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
+{
+ struct ncsi_dev *nd = &ndp->ndev;
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ struct ncsi_cmd_arg nca;
+ unsigned char index;
+ int ret;
+
+ nca.ndp = ndp;
+ nca.driven = true;
+ switch (nd->state) {
+ case ncsi_dev_state_probe:
+ nd->state = ncsi_dev_state_probe_deselect;
+ /* Fall through */
+ case ncsi_dev_state_probe_deselect:
+ ndp->pending_req_num = 8;
+
+ /* Deselect all possible packages */
+ nca.type = NCSI_PKT_CMD_DP;
+ nca.channel = 0x1f;
+ for (index = 0; index < 8; index++) {
+ nca.package = index;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+ }
+
+ nd->state = ncsi_dev_state_probe_package;
+ break;
+ case ncsi_dev_state_probe_package:
+ ndp->pending_req_num = 16;
+
+ /* Select all possible packages */
+ nca.type = NCSI_PKT_CMD_SP;
+ nca.bytes[0] = 1;
+ nca.channel = 0x1f;
+ for (index = 0; index < 8; index++) {
+ nca.package = index;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+ }
+
+ /* Disable all possible packages */
+ nca.type = NCSI_PKT_CMD_DP;
+ for (index = 0; index < 8; index++) {
+ nca.package = index;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+ }
+
+ nd->state = ncsi_dev_state_probe_channel;
+ break;
+ case ncsi_dev_state_probe_channel:
+ if (!ndp->active_package)
+ ndp->active_package = list_first_or_null_rcu(
+ &ndp->packages, struct ncsi_package, node);
+ else if (list_is_last(&ndp->active_package->node,
+ &ndp->packages))
+ ndp->active_package = NULL;
+ else
+ ndp->active_package = list_next_entry(
+ ndp->active_package, node);
+
+ /* All available packages and channels are enumerated. The
+ * enumeration happens for once when the NCSI interface is
+ * started. So we need continue to start the interface after
+ * the enumeration.
+ *
+ * We have to choose an active channel before configuring it.
+ * Note that we possibly don't have active channel in extreme
+ * situation.
+ */
+ if (!ndp->active_package) {
+ ndp->flags |= NCSI_DEV_PROBED;
+ if (ncsi_check_hwa(ndp))
+ ncsi_enable_hwa(ndp);
+ else
+ ncsi_choose_active_channel(ndp);
+ return;
+ }
+
+ /* Select the active package */
+ ndp->pending_req_num = 1;
+ nca.type = NCSI_PKT_CMD_SP;
+ nca.bytes[0] = 1;
+ nca.package = ndp->active_package->id;
+ nca.channel = 0x1f;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+
+ nd->state = ncsi_dev_state_probe_cis;
+ break;
+ case ncsi_dev_state_probe_cis:
+ ndp->pending_req_num = 32;
+
+ /* Clear initial state */
+ nca.type = NCSI_PKT_CMD_CIS;
+ nca.package = ndp->active_package->id;
+ for (index = 0; index < 0x20; index++) {
+ nca.channel = index;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+ }
+
+ nd->state = ncsi_dev_state_probe_gvi;
+ break;
+ case ncsi_dev_state_probe_gvi:
+ case ncsi_dev_state_probe_gc:
+ case ncsi_dev_state_probe_gls:
+ np = ndp->active_package;
+ ndp->pending_req_num = np->channel_num;
+
+ /* Retrieve version, capability or link status */
+ if (nd->state == ncsi_dev_state_probe_gvi)
+ nca.type = NCSI_PKT_CMD_GVI;
+ else if (nd->state == ncsi_dev_state_probe_gc)
+ nca.type = NCSI_PKT_CMD_GC;
+ else
+ nca.type = NCSI_PKT_CMD_GLS;
+
+ nca.package = np->id;
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ nca.channel = nc->id;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+ }
+
+ if (nd->state == ncsi_dev_state_probe_gvi)
+ nd->state = ncsi_dev_state_probe_gc;
+ else if (nd->state == ncsi_dev_state_probe_gc)
+ nd->state = ncsi_dev_state_probe_gls;
+ else
+ nd->state = ncsi_dev_state_probe_dp;
+ break;
+ case ncsi_dev_state_probe_dp:
+ ndp->pending_req_num = 1;
+
+ /* Deselect the active package */
+ nca.type = NCSI_PKT_CMD_DP;
+ nca.package = ndp->active_package->id;
+ nca.channel = 0x1f;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+
+ /* Scan channels in next package */
+ nd->state = ncsi_dev_state_probe_channel;
+ break;
+ default:
+ netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n",
+ nd->state);
+ }
+
+ return;
+error:
+ ncsi_report_link(ndp, true);
+}
+
+static void ncsi_dev_work(struct work_struct *work)
+{
+ struct ncsi_dev_priv *ndp = container_of(work,
+ struct ncsi_dev_priv, work);
+ struct ncsi_dev *nd = &ndp->ndev;
+
+ switch (nd->state & ncsi_dev_state_major) {
+ case ncsi_dev_state_probe:
+ ncsi_probe_channel(ndp);
+ break;
+ case ncsi_dev_state_suspend:
+ ncsi_suspend_channel(ndp);
+ break;
+ case ncsi_dev_state_config:
+ ncsi_configure_channel(ndp);
+ break;
+ default:
+ netdev_warn(nd->dev, "Wrong NCSI state 0x%x in workqueue\n",
+ nd->state);
+ }
+}
+
+int ncsi_process_next_channel(struct ncsi_dev_priv *ndp)
+{
+ struct ncsi_channel *nc;
+ int old_state;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ndp->lock, flags);
+ nc = list_first_or_null_rcu(&ndp->channel_queue,
+ struct ncsi_channel, link);
+ if (!nc) {
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ goto out;
+ }
+
+ old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE);
+ list_del_init(&nc->link);
+
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ ndp->active_channel = nc;
+ ndp->active_package = nc->package;
+
+ switch (old_state) {
+ case NCSI_CHANNEL_INACTIVE:
+ ndp->ndev.state = ncsi_dev_state_config;
+ ncsi_configure_channel(ndp);
+ break;
+ case NCSI_CHANNEL_ACTIVE:
+ ndp->ndev.state = ncsi_dev_state_suspend;
+ ncsi_suspend_channel(ndp);
+ break;
+ default:
+ netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n",
+ nc->state, nc->package->id, nc->id);
+ ncsi_report_link(ndp, false);
+ return -EINVAL;
+ }
+
+ return 0;
+
+out:
+ ndp->active_channel = NULL;
+ ndp->active_package = NULL;
+ if (ndp->flags & NCSI_DEV_RESHUFFLE) {
+ ndp->flags &= ~NCSI_DEV_RESHUFFLE;
+ return ncsi_choose_active_channel(ndp);
+ }
+
+ ncsi_report_link(ndp, false);
+ return -ENODEV;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int ncsi_inet6addr_event(struct notifier_block *this,
+ unsigned long event, void *data)
+{
+ struct inet6_ifaddr *ifa = data;
+ struct net_device *dev = ifa->idev->dev;
+ struct ncsi_dev *nd = ncsi_find_dev(dev);
+ struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL;
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ struct ncsi_cmd_arg nca;
+ bool action;
+ int ret;
+
+ if (!ndp || (ipv6_addr_type(&ifa->addr) &
+ (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK)))
+ return NOTIFY_OK;
+
+ switch (event) {
+ case NETDEV_UP:
+ action = (++ndp->inet6_addr_num) == 1;
+ nca.type = NCSI_PKT_CMD_EGMF;
+ break;
+ case NETDEV_DOWN:
+ action = (--ndp->inet6_addr_num == 0);
+ nca.type = NCSI_PKT_CMD_DGMF;
+ break;
+ default:
+ return NOTIFY_OK;
+ }
+
+ /* We might not have active channel or packages. The IPv6
+ * required multicast will be enabled when active channel
+ * or packages are chosen.
+ */
+ np = ndp->active_package;
+ nc = ndp->active_channel;
+ if (!action || !np || !nc)
+ return NOTIFY_OK;
+
+ /* We needn't enable or disable it if the function isn't supported */
+ if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC))
+ return NOTIFY_OK;
+
+ nca.ndp = ndp;
+ nca.driven = false;
+ nca.package = np->id;
+ nca.channel = nc->id;
+ nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret) {
+ netdev_warn(dev, "Fail to %s global multicast filter (%d)\n",
+ (event == NETDEV_UP) ? "enable" : "disable", ret);
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block ncsi_inet6addr_notifier = {
+ .notifier_call = ncsi_inet6addr_event,
+};
+#endif /* CONFIG_IPV6 */
+
+struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
+ void (*handler)(struct ncsi_dev *ndev))
+{
+ struct ncsi_dev_priv *ndp;
+ struct ncsi_dev *nd;
+ unsigned long flags;
+ int i;
+
+ /* Check if the device has been registered or not */
+ nd = ncsi_find_dev(dev);
+ if (nd)
+ return nd;
+
+ /* Create NCSI device */
+ ndp = kzalloc(sizeof(*ndp), GFP_ATOMIC);
+ if (!ndp)
+ return NULL;
+
+ nd = &ndp->ndev;
+ nd->state = ncsi_dev_state_registered;
+ nd->dev = dev;
+ nd->handler = handler;
+ ndp->pending_req_num = 0;
+ INIT_LIST_HEAD(&ndp->channel_queue);
+ INIT_WORK(&ndp->work, ncsi_dev_work);
+
+ /* Initialize private NCSI device */
+ spin_lock_init(&ndp->lock);
+ INIT_LIST_HEAD(&ndp->packages);
+ ndp->request_id = 0;
+ for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) {
+ ndp->requests[i].id = i;
+ ndp->requests[i].ndp = ndp;
+ setup_timer(&ndp->requests[i].timer,
+ ncsi_request_timeout,
+ (unsigned long)&ndp->requests[i]);
+ }
+
+ spin_lock_irqsave(&ncsi_dev_lock, flags);
+#if IS_ENABLED(CONFIG_IPV6)
+ ndp->inet6_addr_num = 0;
+ if (list_empty(&ncsi_dev_list))
+ register_inet6addr_notifier(&ncsi_inet6addr_notifier);
+#endif
+ list_add_tail_rcu(&ndp->node, &ncsi_dev_list);
+ spin_unlock_irqrestore(&ncsi_dev_lock, flags);
+
+ /* Register NCSI packet Rx handler */
+ ndp->ptype.type = cpu_to_be16(ETH_P_NCSI);
+ ndp->ptype.func = ncsi_rcv_rsp;
+ ndp->ptype.dev = dev;
+ dev_add_pack(&ndp->ptype);
+
+ return nd;
+}
+EXPORT_SYMBOL_GPL(ncsi_register_dev);
+
+int ncsi_start_dev(struct ncsi_dev *nd)
+{
+ struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ int old_state, ret;
+
+ if (nd->state != ncsi_dev_state_registered &&
+ nd->state != ncsi_dev_state_functional)
+ return -ENOTTY;
+
+ if (!(ndp->flags & NCSI_DEV_PROBED)) {
+ nd->state = ncsi_dev_state_probe;
+ schedule_work(&ndp->work);
+ return 0;
+ }
+
+ /* Reset channel's state and start over */
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ WARN_ON_ONCE(!list_empty(&nc->link) ||
+ old_state == NCSI_CHANNEL_INVISIBLE);
+ }
+ }
+
+ if (ndp->flags & NCSI_DEV_HWA)
+ ret = ncsi_enable_hwa(ndp);
+ else
+ ret = ncsi_choose_active_channel(ndp);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ncsi_start_dev);
+
+void ncsi_unregister_dev(struct ncsi_dev *nd)
+{
+ struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
+ struct ncsi_package *np, *tmp;
+ unsigned long flags;
+
+ dev_remove_pack(&ndp->ptype);
+
+ list_for_each_entry_safe(np, tmp, &ndp->packages, node)
+ ncsi_remove_package(np);
+
+ spin_lock_irqsave(&ncsi_dev_lock, flags);
+ list_del_rcu(&ndp->node);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (list_empty(&ncsi_dev_list))
+ unregister_inet6addr_notifier(&ncsi_inet6addr_notifier);
+#endif
+ spin_unlock_irqrestore(&ncsi_dev_lock, flags);
+
+ kfree(ndp);
+}
+EXPORT_SYMBOL_GPL(ncsi_unregister_dev);
diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h
new file mode 100644
index 000000000..3ea49ed0a
--- /dev/null
+++ b/net/ncsi/ncsi-pkt.h
@@ -0,0 +1,415 @@
+/*
+ * Copyright Gavin Shan, IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __NCSI_PKT_H__
+#define __NCSI_PKT_H__
+
+struct ncsi_pkt_hdr {
+ unsigned char mc_id; /* Management controller ID */
+ unsigned char revision; /* NCSI version - 0x01 */
+ unsigned char reserved; /* Reserved */
+ unsigned char id; /* Packet sequence number */
+ unsigned char type; /* Packet type */
+ unsigned char channel; /* Network controller ID */
+ __be16 length; /* Payload length */
+ __be32 reserved1[2]; /* Reserved */
+};
+
+struct ncsi_cmd_pkt_hdr {
+ struct ncsi_pkt_hdr common; /* Common NCSI packet header */
+};
+
+struct ncsi_rsp_pkt_hdr {
+ struct ncsi_pkt_hdr common; /* Common NCSI packet header */
+ __be16 code; /* Response code */
+ __be16 reason; /* Response reason */
+};
+
+struct ncsi_aen_pkt_hdr {
+ struct ncsi_pkt_hdr common; /* Common NCSI packet header */
+ unsigned char reserved2[3]; /* Reserved */
+ unsigned char type; /* AEN packet type */
+};
+
+/* NCSI common command packet */
+struct ncsi_cmd_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[26];
+};
+
+struct ncsi_rsp_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[22];
+};
+
+/* Select Package */
+struct ncsi_cmd_sp_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ unsigned char reserved[3]; /* Reserved */
+ unsigned char hw_arbitration; /* HW arbitration */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[22];
+};
+
+/* Disable Channel */
+struct ncsi_cmd_dc_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ unsigned char reserved[3]; /* Reserved */
+ unsigned char ald; /* Allow link down */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[22];
+};
+
+/* Reset Channel */
+struct ncsi_cmd_rc_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ __be32 reserved; /* Reserved */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[22];
+};
+
+/* AEN Enable */
+struct ncsi_cmd_ae_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ unsigned char reserved[3]; /* Reserved */
+ unsigned char mc_id; /* MC ID */
+ __be32 mode; /* AEN working mode */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[18];
+};
+
+/* Set Link */
+struct ncsi_cmd_sl_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ __be32 mode; /* Link working mode */
+ __be32 oem_mode; /* OEM link mode */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[18];
+};
+
+/* Set VLAN Filter */
+struct ncsi_cmd_svf_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ __be16 reserved; /* Reserved */
+ __be16 vlan; /* VLAN ID */
+ __be16 reserved1; /* Reserved */
+ unsigned char index; /* VLAN table index */
+ unsigned char enable; /* Enable or disable */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[14];
+};
+
+/* Enable VLAN */
+struct ncsi_cmd_ev_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ unsigned char reserved[3]; /* Reserved */
+ unsigned char mode; /* VLAN filter mode */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[22];
+};
+
+/* Set MAC Address */
+struct ncsi_cmd_sma_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ unsigned char mac[6]; /* MAC address */
+ unsigned char index; /* MAC table index */
+ unsigned char at_e; /* Addr type and operation */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[18];
+};
+
+/* Enable Broadcast Filter */
+struct ncsi_cmd_ebf_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ __be32 mode; /* Filter mode */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[22];
+};
+
+/* Enable Global Multicast Filter */
+struct ncsi_cmd_egmf_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ __be32 mode; /* Global MC mode */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[22];
+};
+
+/* Set NCSI Flow Control */
+struct ncsi_cmd_snfc_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ unsigned char reserved[3]; /* Reserved */
+ unsigned char mode; /* Flow control mode */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[22];
+};
+
+/* Get Link Status */
+struct ncsi_rsp_gls_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ __be32 status; /* Link status */
+ __be32 other; /* Other indications */
+ __be32 oem_status; /* OEM link status */
+ __be32 checksum;
+ unsigned char pad[10];
+};
+
+/* Get Version ID */
+struct ncsi_rsp_gvi_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ __be32 ncsi_version; /* NCSI version */
+ unsigned char reserved[3]; /* Reserved */
+ unsigned char alpha2; /* NCSI version */
+ unsigned char fw_name[12]; /* f/w name string */
+ __be32 fw_version; /* f/w version */
+ __be16 pci_ids[4]; /* PCI IDs */
+ __be32 mf_id; /* Manufacture ID */
+ __be32 checksum;
+};
+
+/* Get Capabilities */
+struct ncsi_rsp_gc_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ __be32 cap; /* Capabilities */
+ __be32 bc_cap; /* Broadcast cap */
+ __be32 mc_cap; /* Multicast cap */
+ __be32 buf_cap; /* Buffering cap */
+ __be32 aen_cap; /* AEN cap */
+ unsigned char vlan_cnt; /* VLAN filter count */
+ unsigned char mixed_cnt; /* Mix filter count */
+ unsigned char mc_cnt; /* MC filter count */
+ unsigned char uc_cnt; /* UC filter count */
+ unsigned char reserved[2]; /* Reserved */
+ unsigned char vlan_mode; /* VLAN mode */
+ unsigned char channel_cnt; /* Channel count */
+ __be32 checksum; /* Checksum */
+};
+
+/* Get Parameters */
+struct ncsi_rsp_gp_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ unsigned char mac_cnt; /* Number of MAC addr */
+ unsigned char reserved[2]; /* Reserved */
+ unsigned char mac_enable; /* MAC addr enable flags */
+ unsigned char vlan_cnt; /* VLAN tag count */
+ unsigned char reserved1; /* Reserved */
+ __be16 vlan_enable; /* VLAN tag enable flags */
+ __be32 link_mode; /* Link setting */
+ __be32 bc_mode; /* BC filter mode */
+ __be32 valid_modes; /* Valid mode parameters */
+ unsigned char vlan_mode; /* VLAN mode */
+ unsigned char fc_mode; /* Flow control mode */
+ unsigned char reserved2[2]; /* Reserved */
+ __be32 aen_mode; /* AEN mode */
+ unsigned char mac[6]; /* Supported MAC addr */
+ __be16 vlan; /* Supported VLAN tags */
+ __be32 checksum; /* Checksum */
+};
+
+/* Get Controller Packet Statistics */
+struct ncsi_rsp_gcps_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ __be32 cnt_hi; /* Counter cleared */
+ __be32 cnt_lo; /* Counter cleared */
+ __be32 rx_bytes; /* Rx bytes */
+ __be32 tx_bytes; /* Tx bytes */
+ __be32 rx_uc_pkts; /* Rx UC packets */
+ __be32 rx_mc_pkts; /* Rx MC packets */
+ __be32 rx_bc_pkts; /* Rx BC packets */
+ __be32 tx_uc_pkts; /* Tx UC packets */
+ __be32 tx_mc_pkts; /* Tx MC packets */
+ __be32 tx_bc_pkts; /* Tx BC packets */
+ __be32 fcs_err; /* FCS errors */
+ __be32 align_err; /* Alignment errors */
+ __be32 false_carrier; /* False carrier detection */
+ __be32 runt_pkts; /* Rx runt packets */
+ __be32 jabber_pkts; /* Rx jabber packets */
+ __be32 rx_pause_xon; /* Rx pause XON frames */
+ __be32 rx_pause_xoff; /* Rx XOFF frames */
+ __be32 tx_pause_xon; /* Tx XON frames */
+ __be32 tx_pause_xoff; /* Tx XOFF frames */
+ __be32 tx_s_collision; /* Single collision frames */
+ __be32 tx_m_collision; /* Multiple collision frames */
+ __be32 l_collision; /* Late collision frames */
+ __be32 e_collision; /* Excessive collision frames */
+ __be32 rx_ctl_frames; /* Rx control frames */
+ __be32 rx_64_frames; /* Rx 64-bytes frames */
+ __be32 rx_127_frames; /* Rx 65-127 bytes frames */
+ __be32 rx_255_frames; /* Rx 128-255 bytes frames */
+ __be32 rx_511_frames; /* Rx 256-511 bytes frames */
+ __be32 rx_1023_frames; /* Rx 512-1023 bytes frames */
+ __be32 rx_1522_frames; /* Rx 1024-1522 bytes frames */
+ __be32 rx_9022_frames; /* Rx 1523-9022 bytes frames */
+ __be32 tx_64_frames; /* Tx 64-bytes frames */
+ __be32 tx_127_frames; /* Tx 65-127 bytes frames */
+ __be32 tx_255_frames; /* Tx 128-255 bytes frames */
+ __be32 tx_511_frames; /* Tx 256-511 bytes frames */
+ __be32 tx_1023_frames; /* Tx 512-1023 bytes frames */
+ __be32 tx_1522_frames; /* Tx 1024-1522 bytes frames */
+ __be32 tx_9022_frames; /* Tx 1523-9022 bytes frames */
+ __be32 rx_valid_bytes; /* Rx valid bytes */
+ __be32 rx_runt_pkts; /* Rx error runt packets */
+ __be32 rx_jabber_pkts; /* Rx error jabber packets */
+ __be32 checksum; /* Checksum */
+};
+
+/* Get NCSI Statistics */
+struct ncsi_rsp_gns_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ __be32 rx_cmds; /* Rx NCSI commands */
+ __be32 dropped_cmds; /* Dropped commands */
+ __be32 cmd_type_errs; /* Command type errors */
+ __be32 cmd_csum_errs; /* Command checksum errors */
+ __be32 rx_pkts; /* Rx NCSI packets */
+ __be32 tx_pkts; /* Tx NCSI packets */
+ __be32 tx_aen_pkts; /* Tx AEN packets */
+ __be32 checksum; /* Checksum */
+};
+
+/* Get NCSI Pass-through Statistics */
+struct ncsi_rsp_gnpts_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ __be32 tx_pkts; /* Tx packets */
+ __be32 tx_dropped; /* Tx dropped packets */
+ __be32 tx_channel_err; /* Tx channel errors */
+ __be32 tx_us_err; /* Tx undersize errors */
+ __be32 rx_pkts; /* Rx packets */
+ __be32 rx_dropped; /* Rx dropped packets */
+ __be32 rx_channel_err; /* Rx channel errors */
+ __be32 rx_us_err; /* Rx undersize errors */
+ __be32 rx_os_err; /* Rx oversize errors */
+ __be32 checksum; /* Checksum */
+};
+
+/* Get package status */
+struct ncsi_rsp_gps_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ __be32 status; /* Hardware arbitration status */
+ __be32 checksum;
+};
+
+/* Get package UUID */
+struct ncsi_rsp_gpuuid_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Response header */
+ unsigned char uuid[16]; /* UUID */
+ __be32 checksum;
+};
+
+/* AEN: Link State Change */
+struct ncsi_aen_lsc_pkt {
+ struct ncsi_aen_pkt_hdr aen; /* AEN header */
+ __be32 status; /* Link status */
+ __be32 oem_status; /* OEM link status */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[14];
+};
+
+/* AEN: Configuration Required */
+struct ncsi_aen_cr_pkt {
+ struct ncsi_aen_pkt_hdr aen; /* AEN header */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[22];
+};
+
+/* AEN: Host Network Controller Driver Status Change */
+struct ncsi_aen_hncdsc_pkt {
+ struct ncsi_aen_pkt_hdr aen; /* AEN header */
+ __be32 status; /* Status */
+ __be32 checksum; /* Checksum */
+ unsigned char pad[18];
+};
+
+/* NCSI packet revision */
+#define NCSI_PKT_REVISION 0x01
+
+/* NCSI packet commands */
+#define NCSI_PKT_CMD_CIS 0x00 /* Clear Initial State */
+#define NCSI_PKT_CMD_SP 0x01 /* Select Package */
+#define NCSI_PKT_CMD_DP 0x02 /* Deselect Package */
+#define NCSI_PKT_CMD_EC 0x03 /* Enable Channel */
+#define NCSI_PKT_CMD_DC 0x04 /* Disable Channel */
+#define NCSI_PKT_CMD_RC 0x05 /* Reset Channel */
+#define NCSI_PKT_CMD_ECNT 0x06 /* Enable Channel Network Tx */
+#define NCSI_PKT_CMD_DCNT 0x07 /* Disable Channel Network Tx */
+#define NCSI_PKT_CMD_AE 0x08 /* AEN Enable */
+#define NCSI_PKT_CMD_SL 0x09 /* Set Link */
+#define NCSI_PKT_CMD_GLS 0x0a /* Get Link */
+#define NCSI_PKT_CMD_SVF 0x0b /* Set VLAN Filter */
+#define NCSI_PKT_CMD_EV 0x0c /* Enable VLAN */
+#define NCSI_PKT_CMD_DV 0x0d /* Disable VLAN */
+#define NCSI_PKT_CMD_SMA 0x0e /* Set MAC address */
+#define NCSI_PKT_CMD_EBF 0x10 /* Enable Broadcast Filter */
+#define NCSI_PKT_CMD_DBF 0x11 /* Disable Broadcast Filter */
+#define NCSI_PKT_CMD_EGMF 0x12 /* Enable Global Multicast Filter */
+#define NCSI_PKT_CMD_DGMF 0x13 /* Disable Global Multicast Filter */
+#define NCSI_PKT_CMD_SNFC 0x14 /* Set NCSI Flow Control */
+#define NCSI_PKT_CMD_GVI 0x15 /* Get Version ID */
+#define NCSI_PKT_CMD_GC 0x16 /* Get Capabilities */
+#define NCSI_PKT_CMD_GP 0x17 /* Get Parameters */
+#define NCSI_PKT_CMD_GCPS 0x18 /* Get Controller Packet Statistics */
+#define NCSI_PKT_CMD_GNS 0x19 /* Get NCSI Statistics */
+#define NCSI_PKT_CMD_GNPTS 0x1a /* Get NCSI Pass-throu Statistics */
+#define NCSI_PKT_CMD_GPS 0x1b /* Get package status */
+#define NCSI_PKT_CMD_OEM 0x50 /* OEM */
+#define NCSI_PKT_CMD_PLDM 0x51 /* PLDM request over NCSI over RBT */
+#define NCSI_PKT_CMD_GPUUID 0x52 /* Get package UUID */
+
+/* NCSI packet responses */
+#define NCSI_PKT_RSP_CIS (NCSI_PKT_CMD_CIS + 0x80)
+#define NCSI_PKT_RSP_SP (NCSI_PKT_CMD_SP + 0x80)
+#define NCSI_PKT_RSP_DP (NCSI_PKT_CMD_DP + 0x80)
+#define NCSI_PKT_RSP_EC (NCSI_PKT_CMD_EC + 0x80)
+#define NCSI_PKT_RSP_DC (NCSI_PKT_CMD_DC + 0x80)
+#define NCSI_PKT_RSP_RC (NCSI_PKT_CMD_RC + 0x80)
+#define NCSI_PKT_RSP_ECNT (NCSI_PKT_CMD_ECNT + 0x80)
+#define NCSI_PKT_RSP_DCNT (NCSI_PKT_CMD_DCNT + 0x80)
+#define NCSI_PKT_RSP_AE (NCSI_PKT_CMD_AE + 0x80)
+#define NCSI_PKT_RSP_SL (NCSI_PKT_CMD_SL + 0x80)
+#define NCSI_PKT_RSP_GLS (NCSI_PKT_CMD_GLS + 0x80)
+#define NCSI_PKT_RSP_SVF (NCSI_PKT_CMD_SVF + 0x80)
+#define NCSI_PKT_RSP_EV (NCSI_PKT_CMD_EV + 0x80)
+#define NCSI_PKT_RSP_DV (NCSI_PKT_CMD_DV + 0x80)
+#define NCSI_PKT_RSP_SMA (NCSI_PKT_CMD_SMA + 0x80)
+#define NCSI_PKT_RSP_EBF (NCSI_PKT_CMD_EBF + 0x80)
+#define NCSI_PKT_RSP_DBF (NCSI_PKT_CMD_DBF + 0x80)
+#define NCSI_PKT_RSP_EGMF (NCSI_PKT_CMD_EGMF + 0x80)
+#define NCSI_PKT_RSP_DGMF (NCSI_PKT_CMD_DGMF + 0x80)
+#define NCSI_PKT_RSP_SNFC (NCSI_PKT_CMD_SNFC + 0x80)
+#define NCSI_PKT_RSP_GVI (NCSI_PKT_CMD_GVI + 0x80)
+#define NCSI_PKT_RSP_GC (NCSI_PKT_CMD_GC + 0x80)
+#define NCSI_PKT_RSP_GP (NCSI_PKT_CMD_GP + 0x80)
+#define NCSI_PKT_RSP_GCPS (NCSI_PKT_CMD_GCPS + 0x80)
+#define NCSI_PKT_RSP_GNS (NCSI_PKT_CMD_GNS + 0x80)
+#define NCSI_PKT_RSP_GNPTS (NCSI_PKT_CMD_GNPTS + 0x80)
+#define NCSI_PKT_RSP_GPS (NCSI_PKT_CMD_GPS + 0x80)
+#define NCSI_PKT_RSP_OEM (NCSI_PKT_CMD_OEM + 0x80)
+#define NCSI_PKT_RSP_PLDM (NCSI_PKT_CMD_PLDM + 0x80)
+#define NCSI_PKT_RSP_GPUUID (NCSI_PKT_CMD_GPUUID + 0x80)
+
+/* NCSI response code/reason */
+#define NCSI_PKT_RSP_C_COMPLETED 0x0000 /* Command Completed */
+#define NCSI_PKT_RSP_C_FAILED 0x0001 /* Command Failed */
+#define NCSI_PKT_RSP_C_UNAVAILABLE 0x0002 /* Command Unavailable */
+#define NCSI_PKT_RSP_C_UNSUPPORTED 0x0003 /* Command Unsupported */
+#define NCSI_PKT_RSP_R_NO_ERROR 0x0000 /* No Error */
+#define NCSI_PKT_RSP_R_INTERFACE 0x0001 /* Interface not ready */
+#define NCSI_PKT_RSP_R_PARAM 0x0002 /* Invalid Parameter */
+#define NCSI_PKT_RSP_R_CHANNEL 0x0003 /* Channel not Ready */
+#define NCSI_PKT_RSP_R_PACKAGE 0x0004 /* Package not Ready */
+#define NCSI_PKT_RSP_R_LENGTH 0x0005 /* Invalid payload length */
+#define NCSI_PKT_RSP_R_UNKNOWN 0x7fff /* Command type unsupported */
+
+/* NCSI AEN packet type */
+#define NCSI_PKT_AEN 0xFF /* AEN Packet */
+#define NCSI_PKT_AEN_LSC 0x00 /* Link status change */
+#define NCSI_PKT_AEN_CR 0x01 /* Configuration required */
+#define NCSI_PKT_AEN_HNCDSC 0x02 /* HNC driver status change */
+
+#endif /* __NCSI_PKT_H__ */
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
new file mode 100644
index 000000000..af84389a6
--- /dev/null
+++ b/net/ncsi/ncsi-rsp.c
@@ -0,0 +1,1035 @@
+/*
+ * Copyright Gavin Shan, IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/ncsi.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "internal.h"
+#include "ncsi-pkt.h"
+
+static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
+ unsigned short payload)
+{
+ struct ncsi_rsp_pkt_hdr *h;
+ u32 checksum;
+ __be32 *pchecksum;
+
+ /* Check NCSI packet header. We don't need validate
+ * the packet type, which should have been checked
+ * before calling this function.
+ */
+ h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp);
+ if (h->common.revision != NCSI_PKT_REVISION)
+ return -EINVAL;
+ if (ntohs(h->common.length) != payload)
+ return -EINVAL;
+
+ /* Check on code and reason */
+ if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED ||
+ ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR)
+ return -EINVAL;
+
+ /* Validate checksum, which might be zeroes if the
+ * sender doesn't support checksum according to NCSI
+ * specification.
+ */
+ pchecksum = (__be32 *)((void *)(h + 1) + payload - 4);
+ if (ntohl(*pchecksum) == 0)
+ return 0;
+
+ checksum = ncsi_calculate_checksum((unsigned char *)h,
+ sizeof(*h) + payload - 4);
+ if (*pchecksum != htonl(checksum))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_cis(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ unsigned char id;
+
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, &np, &nc);
+ if (!nc) {
+ if (ndp->flags & NCSI_DEV_PROBED)
+ return -ENXIO;
+
+ id = NCSI_CHANNEL_INDEX(rsp->rsp.common.channel);
+ nc = ncsi_add_channel(np, id);
+ }
+
+ return nc ? 0 : -ENODEV;
+}
+
+static int ncsi_rsp_handler_sp(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_package *np;
+ unsigned char id;
+
+ /* Add the package if it's not existing. Otherwise,
+ * to change the state of its child channels.
+ */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ &np, NULL);
+ if (!np) {
+ if (ndp->flags & NCSI_DEV_PROBED)
+ return -ENXIO;
+
+ id = NCSI_PACKAGE_INDEX(rsp->rsp.common.channel);
+ np = ncsi_add_package(ndp, id);
+ if (!np)
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_dp(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ unsigned long flags;
+
+ /* Find the package */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ &np, NULL);
+ if (!np)
+ return -ENODEV;
+
+ /* Change state of all channels attached to the package */
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+ }
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_ec(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ ncm = &nc->modes[NCSI_MODE_ENABLE];
+ if (ncm->enable)
+ return -EBUSY;
+
+ ncm->enable = 1;
+ return 0;
+}
+
+static int ncsi_rsp_handler_dc(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+ int ret;
+
+ ret = ncsi_validate_rsp_pkt(nr, 4);
+ if (ret)
+ return ret;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ ncm = &nc->modes[NCSI_MODE_ENABLE];
+ if (!ncm->enable)
+ return -EBUSY;
+
+ ncm->enable = 0;
+ return 0;
+}
+
+static int ncsi_rsp_handler_rc(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ unsigned long flags;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Update state for the specified channel */
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_ecnt(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ ncm = &nc->modes[NCSI_MODE_TX_ENABLE];
+ if (ncm->enable)
+ return -EBUSY;
+
+ ncm->enable = 1;
+ return 0;
+}
+
+static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ ncm = &nc->modes[NCSI_MODE_TX_ENABLE];
+ if (!ncm->enable)
+ return -EBUSY;
+
+ ncm->enable = 1;
+ return 0;
+}
+
+static int ncsi_rsp_handler_ae(struct ncsi_request *nr)
+{
+ struct ncsi_cmd_ae_pkt *cmd;
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Check if the AEN has been enabled */
+ ncm = &nc->modes[NCSI_MODE_AEN];
+ if (ncm->enable)
+ return -EBUSY;
+
+ /* Update to AEN configuration */
+ cmd = (struct ncsi_cmd_ae_pkt *)skb_network_header(nr->cmd);
+ ncm->enable = 1;
+ ncm->data[0] = cmd->mc_id;
+ ncm->data[1] = ntohl(cmd->mode);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_sl(struct ncsi_request *nr)
+{
+ struct ncsi_cmd_sl_pkt *cmd;
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ cmd = (struct ncsi_cmd_sl_pkt *)skb_network_header(nr->cmd);
+ ncm = &nc->modes[NCSI_MODE_LINK];
+ ncm->data[0] = ntohl(cmd->mode);
+ ncm->data[1] = ntohl(cmd->oem_mode);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_gls(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_gls_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+ unsigned long flags;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_gls_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ ncm = &nc->modes[NCSI_MODE_LINK];
+ ncm->data[2] = ntohl(rsp->status);
+ ncm->data[3] = ntohl(rsp->other);
+ ncm->data[4] = ntohl(rsp->oem_status);
+
+ if (nr->driven)
+ return 0;
+
+ /* Reset the channel monitor if it has been enabled */
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->timeout = 0;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
+{
+ struct ncsi_cmd_svf_pkt *cmd;
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_filter *ncf;
+ unsigned short vlan;
+ int ret;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd);
+ ncf = nc->filters[NCSI_FILTER_VLAN];
+ if (!ncf)
+ return -ENOENT;
+ if (cmd->index >= ncf->total)
+ return -ERANGE;
+
+ /* Add or remove the VLAN filter */
+ if (!(cmd->enable & 0x1)) {
+ ret = ncsi_remove_filter(nc, NCSI_FILTER_VLAN, cmd->index);
+ } else {
+ vlan = ntohs(cmd->vlan);
+ ret = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan);
+ }
+
+ return ret;
+}
+
+static int ncsi_rsp_handler_ev(struct ncsi_request *nr)
+{
+ struct ncsi_cmd_ev_pkt *cmd;
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Check if VLAN mode has been enabled */
+ ncm = &nc->modes[NCSI_MODE_VLAN];
+ if (ncm->enable)
+ return -EBUSY;
+
+ /* Update to VLAN mode */
+ cmd = (struct ncsi_cmd_ev_pkt *)skb_network_header(nr->cmd);
+ ncm->enable = 1;
+ ncm->data[0] = ntohl(cmd->mode);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_dv(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Check if VLAN mode has been enabled */
+ ncm = &nc->modes[NCSI_MODE_VLAN];
+ if (!ncm->enable)
+ return -EBUSY;
+
+ /* Update to VLAN mode */
+ ncm->enable = 0;
+ return 0;
+}
+
+static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
+{
+ struct ncsi_cmd_sma_pkt *cmd;
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_filter *ncf;
+ void *bitmap;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* According to NCSI spec 1.01, the mixed filter table
+ * isn't supported yet.
+ */
+ cmd = (struct ncsi_cmd_sma_pkt *)skb_network_header(nr->cmd);
+ switch (cmd->at_e >> 5) {
+ case 0x0: /* UC address */
+ ncf = nc->filters[NCSI_FILTER_UC];
+ break;
+ case 0x1: /* MC address */
+ ncf = nc->filters[NCSI_FILTER_MC];
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Sanity check on the filter */
+ if (!ncf)
+ return -ENOENT;
+ else if (cmd->index >= ncf->total)
+ return -ERANGE;
+
+ bitmap = &ncf->bitmap;
+ if (cmd->at_e & 0x1) {
+ if (test_and_set_bit(cmd->index, bitmap))
+ return -EBUSY;
+ memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6);
+ } else {
+ if (!test_and_clear_bit(cmd->index, bitmap))
+ return -EBUSY;
+
+ memset(ncf->data + 6 * cmd->index, 0, 6);
+ }
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_ebf(struct ncsi_request *nr)
+{
+ struct ncsi_cmd_ebf_pkt *cmd;
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the package and channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Check if broadcast filter has been enabled */
+ ncm = &nc->modes[NCSI_MODE_BC];
+ if (ncm->enable)
+ return -EBUSY;
+
+ /* Update to broadcast filter mode */
+ cmd = (struct ncsi_cmd_ebf_pkt *)skb_network_header(nr->cmd);
+ ncm->enable = 1;
+ ncm->data[0] = ntohl(cmd->mode);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_dbf(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Check if broadcast filter isn't enabled */
+ ncm = &nc->modes[NCSI_MODE_BC];
+ if (!ncm->enable)
+ return -EBUSY;
+
+ /* Update to broadcast filter mode */
+ ncm->enable = 0;
+ ncm->data[0] = 0;
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_egmf(struct ncsi_request *nr)
+{
+ struct ncsi_cmd_egmf_pkt *cmd;
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Check if multicast filter has been enabled */
+ ncm = &nc->modes[NCSI_MODE_MC];
+ if (ncm->enable)
+ return -EBUSY;
+
+ /* Update to multicast filter mode */
+ cmd = (struct ncsi_cmd_egmf_pkt *)skb_network_header(nr->cmd);
+ ncm->enable = 1;
+ ncm->data[0] = ntohl(cmd->mode);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_dgmf(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Check if multicast filter has been enabled */
+ ncm = &nc->modes[NCSI_MODE_MC];
+ if (!ncm->enable)
+ return -EBUSY;
+
+ /* Update to multicast filter mode */
+ ncm->enable = 0;
+ ncm->data[0] = 0;
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_snfc(struct ncsi_request *nr)
+{
+ struct ncsi_cmd_snfc_pkt *cmd;
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_mode *ncm;
+
+ /* Find the channel */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Check if flow control has been enabled */
+ ncm = &nc->modes[NCSI_MODE_FC];
+ if (ncm->enable)
+ return -EBUSY;
+
+ /* Update to flow control mode */
+ cmd = (struct ncsi_cmd_snfc_pkt *)skb_network_header(nr->cmd);
+ ncm->enable = 1;
+ ncm->data[0] = cmd->mode;
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_gvi(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_gvi_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_version *ncv;
+ int i;
+
+ /* Find the channel */
+ rsp = (struct ncsi_rsp_gvi_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Update to channel's version info */
+ ncv = &nc->version;
+ ncv->version = ntohl(rsp->ncsi_version);
+ ncv->alpha2 = rsp->alpha2;
+ memcpy(ncv->fw_name, rsp->fw_name, 12);
+ ncv->fw_version = ntohl(rsp->fw_version);
+ for (i = 0; i < ARRAY_SIZE(ncv->pci_ids); i++)
+ ncv->pci_ids[i] = ntohs(rsp->pci_ids[i]);
+ ncv->mf_id = ntohl(rsp->mf_id);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_gc_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_filter *ncf;
+ size_t size, entry_size;
+ int cnt, i;
+
+ /* Find the channel */
+ rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Update channel's capabilities */
+ nc->caps[NCSI_CAP_GENERIC].cap = ntohl(rsp->cap) &
+ NCSI_CAP_GENERIC_MASK;
+ nc->caps[NCSI_CAP_BC].cap = ntohl(rsp->bc_cap) &
+ NCSI_CAP_BC_MASK;
+ nc->caps[NCSI_CAP_MC].cap = ntohl(rsp->mc_cap) &
+ NCSI_CAP_MC_MASK;
+ nc->caps[NCSI_CAP_BUFFER].cap = ntohl(rsp->buf_cap);
+ nc->caps[NCSI_CAP_AEN].cap = ntohl(rsp->aen_cap) &
+ NCSI_CAP_AEN_MASK;
+ nc->caps[NCSI_CAP_VLAN].cap = rsp->vlan_mode &
+ NCSI_CAP_VLAN_MASK;
+
+ /* Build filters */
+ for (i = 0; i < NCSI_FILTER_MAX; i++) {
+ switch (i) {
+ case NCSI_FILTER_VLAN:
+ cnt = rsp->vlan_cnt;
+ entry_size = 2;
+ break;
+ case NCSI_FILTER_MIXED:
+ cnt = rsp->mixed_cnt;
+ entry_size = 6;
+ break;
+ case NCSI_FILTER_MC:
+ cnt = rsp->mc_cnt;
+ entry_size = 6;
+ break;
+ case NCSI_FILTER_UC:
+ cnt = rsp->uc_cnt;
+ entry_size = 6;
+ break;
+ default:
+ continue;
+ }
+
+ if (!cnt || nc->filters[i])
+ continue;
+
+ size = sizeof(*ncf) + cnt * entry_size;
+ ncf = kzalloc(size, GFP_ATOMIC);
+ if (!ncf) {
+ pr_warn("%s: Cannot alloc filter table (%d)\n",
+ __func__, i);
+ return -ENOMEM;
+ }
+
+ ncf->index = i;
+ ncf->total = cnt;
+ ncf->bitmap = 0x0ul;
+ nc->filters[i] = ncf;
+ }
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_gp(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_gp_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ unsigned short enable, vlan;
+ unsigned char *pdata;
+ int table, i;
+
+ /* Find the channel */
+ rsp = (struct ncsi_rsp_gp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Modes with explicit enabled indications */
+ if (ntohl(rsp->valid_modes) & 0x1) { /* BC filter mode */
+ nc->modes[NCSI_MODE_BC].enable = 1;
+ nc->modes[NCSI_MODE_BC].data[0] = ntohl(rsp->bc_mode);
+ }
+ if (ntohl(rsp->valid_modes) & 0x2) /* Channel enabled */
+ nc->modes[NCSI_MODE_ENABLE].enable = 1;
+ if (ntohl(rsp->valid_modes) & 0x4) /* Channel Tx enabled */
+ nc->modes[NCSI_MODE_TX_ENABLE].enable = 1;
+ if (ntohl(rsp->valid_modes) & 0x8) /* MC filter mode */
+ nc->modes[NCSI_MODE_MC].enable = 1;
+
+ /* Modes without explicit enabled indications */
+ nc->modes[NCSI_MODE_LINK].enable = 1;
+ nc->modes[NCSI_MODE_LINK].data[0] = ntohl(rsp->link_mode);
+ nc->modes[NCSI_MODE_VLAN].enable = 1;
+ nc->modes[NCSI_MODE_VLAN].data[0] = rsp->vlan_mode;
+ nc->modes[NCSI_MODE_FC].enable = 1;
+ nc->modes[NCSI_MODE_FC].data[0] = rsp->fc_mode;
+ nc->modes[NCSI_MODE_AEN].enable = 1;
+ nc->modes[NCSI_MODE_AEN].data[0] = ntohl(rsp->aen_mode);
+
+ /* MAC addresses filter table */
+ pdata = (unsigned char *)rsp + 48;
+ enable = rsp->mac_enable;
+ for (i = 0; i < rsp->mac_cnt; i++, pdata += 6) {
+ if (i >= (nc->filters[NCSI_FILTER_UC]->total +
+ nc->filters[NCSI_FILTER_MC]->total))
+ table = NCSI_FILTER_MIXED;
+ else if (i >= nc->filters[NCSI_FILTER_UC]->total)
+ table = NCSI_FILTER_MC;
+ else
+ table = NCSI_FILTER_UC;
+
+ if (!(enable & (0x1 << i)))
+ continue;
+
+ if (ncsi_find_filter(nc, table, pdata) >= 0)
+ continue;
+
+ ncsi_add_filter(nc, table, pdata);
+ }
+
+ /* VLAN filter table */
+ enable = ntohs(rsp->vlan_enable);
+ for (i = 0; i < rsp->vlan_cnt; i++, pdata += 2) {
+ if (!(enable & (0x1 << i)))
+ continue;
+
+ vlan = ntohs(*(__be16 *)pdata);
+ if (ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan) >= 0)
+ continue;
+
+ ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan);
+ }
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_gcps(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_gcps_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_stats *ncs;
+
+ /* Find the channel */
+ rsp = (struct ncsi_rsp_gcps_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Update HNC's statistics */
+ ncs = &nc->stats;
+ ncs->hnc_cnt_hi = ntohl(rsp->cnt_hi);
+ ncs->hnc_cnt_lo = ntohl(rsp->cnt_lo);
+ ncs->hnc_rx_bytes = ntohl(rsp->rx_bytes);
+ ncs->hnc_tx_bytes = ntohl(rsp->tx_bytes);
+ ncs->hnc_rx_uc_pkts = ntohl(rsp->rx_uc_pkts);
+ ncs->hnc_rx_mc_pkts = ntohl(rsp->rx_mc_pkts);
+ ncs->hnc_rx_bc_pkts = ntohl(rsp->rx_bc_pkts);
+ ncs->hnc_tx_uc_pkts = ntohl(rsp->tx_uc_pkts);
+ ncs->hnc_tx_mc_pkts = ntohl(rsp->tx_mc_pkts);
+ ncs->hnc_tx_bc_pkts = ntohl(rsp->tx_bc_pkts);
+ ncs->hnc_fcs_err = ntohl(rsp->fcs_err);
+ ncs->hnc_align_err = ntohl(rsp->align_err);
+ ncs->hnc_false_carrier = ntohl(rsp->false_carrier);
+ ncs->hnc_runt_pkts = ntohl(rsp->runt_pkts);
+ ncs->hnc_jabber_pkts = ntohl(rsp->jabber_pkts);
+ ncs->hnc_rx_pause_xon = ntohl(rsp->rx_pause_xon);
+ ncs->hnc_rx_pause_xoff = ntohl(rsp->rx_pause_xoff);
+ ncs->hnc_tx_pause_xon = ntohl(rsp->tx_pause_xon);
+ ncs->hnc_tx_pause_xoff = ntohl(rsp->tx_pause_xoff);
+ ncs->hnc_tx_s_collision = ntohl(rsp->tx_s_collision);
+ ncs->hnc_tx_m_collision = ntohl(rsp->tx_m_collision);
+ ncs->hnc_l_collision = ntohl(rsp->l_collision);
+ ncs->hnc_e_collision = ntohl(rsp->e_collision);
+ ncs->hnc_rx_ctl_frames = ntohl(rsp->rx_ctl_frames);
+ ncs->hnc_rx_64_frames = ntohl(rsp->rx_64_frames);
+ ncs->hnc_rx_127_frames = ntohl(rsp->rx_127_frames);
+ ncs->hnc_rx_255_frames = ntohl(rsp->rx_255_frames);
+ ncs->hnc_rx_511_frames = ntohl(rsp->rx_511_frames);
+ ncs->hnc_rx_1023_frames = ntohl(rsp->rx_1023_frames);
+ ncs->hnc_rx_1522_frames = ntohl(rsp->rx_1522_frames);
+ ncs->hnc_rx_9022_frames = ntohl(rsp->rx_9022_frames);
+ ncs->hnc_tx_64_frames = ntohl(rsp->tx_64_frames);
+ ncs->hnc_tx_127_frames = ntohl(rsp->tx_127_frames);
+ ncs->hnc_tx_255_frames = ntohl(rsp->tx_255_frames);
+ ncs->hnc_tx_511_frames = ntohl(rsp->tx_511_frames);
+ ncs->hnc_tx_1023_frames = ntohl(rsp->tx_1023_frames);
+ ncs->hnc_tx_1522_frames = ntohl(rsp->tx_1522_frames);
+ ncs->hnc_tx_9022_frames = ntohl(rsp->tx_9022_frames);
+ ncs->hnc_rx_valid_bytes = ntohl(rsp->rx_valid_bytes);
+ ncs->hnc_rx_runt_pkts = ntohl(rsp->rx_runt_pkts);
+ ncs->hnc_rx_jabber_pkts = ntohl(rsp->rx_jabber_pkts);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_gns(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_gns_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_stats *ncs;
+
+ /* Find the channel */
+ rsp = (struct ncsi_rsp_gns_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Update HNC's statistics */
+ ncs = &nc->stats;
+ ncs->ncsi_rx_cmds = ntohl(rsp->rx_cmds);
+ ncs->ncsi_dropped_cmds = ntohl(rsp->dropped_cmds);
+ ncs->ncsi_cmd_type_errs = ntohl(rsp->cmd_type_errs);
+ ncs->ncsi_cmd_csum_errs = ntohl(rsp->cmd_csum_errs);
+ ncs->ncsi_rx_pkts = ntohl(rsp->rx_pkts);
+ ncs->ncsi_tx_pkts = ntohl(rsp->tx_pkts);
+ ncs->ncsi_tx_aen_pkts = ntohl(rsp->tx_aen_pkts);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_gnpts(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_gnpts_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_channel *nc;
+ struct ncsi_channel_stats *ncs;
+
+ /* Find the channel */
+ rsp = (struct ncsi_rsp_gnpts_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ NULL, &nc);
+ if (!nc)
+ return -ENODEV;
+
+ /* Update HNC's statistics */
+ ncs = &nc->stats;
+ ncs->pt_tx_pkts = ntohl(rsp->tx_pkts);
+ ncs->pt_tx_dropped = ntohl(rsp->tx_dropped);
+ ncs->pt_tx_channel_err = ntohl(rsp->tx_channel_err);
+ ncs->pt_tx_us_err = ntohl(rsp->tx_us_err);
+ ncs->pt_rx_pkts = ntohl(rsp->rx_pkts);
+ ncs->pt_rx_dropped = ntohl(rsp->rx_dropped);
+ ncs->pt_rx_channel_err = ntohl(rsp->rx_channel_err);
+ ncs->pt_rx_us_err = ntohl(rsp->rx_us_err);
+ ncs->pt_rx_os_err = ntohl(rsp->rx_os_err);
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_gps(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_gps_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_package *np;
+
+ /* Find the package */
+ rsp = (struct ncsi_rsp_gps_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ &np, NULL);
+ if (!np)
+ return -ENODEV;
+
+ return 0;
+}
+
+static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_gpuuid_pkt *rsp;
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_package *np;
+
+ /* Find the package */
+ rsp = (struct ncsi_rsp_gpuuid_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ &np, NULL);
+ if (!np)
+ return -ENODEV;
+
+ memcpy(np->uuid, rsp->uuid, sizeof(rsp->uuid));
+
+ return 0;
+}
+
+static struct ncsi_rsp_handler {
+ unsigned char type;
+ int payload;
+ int (*handler)(struct ncsi_request *nr);
+} ncsi_rsp_handlers[] = {
+ { NCSI_PKT_RSP_CIS, 4, ncsi_rsp_handler_cis },
+ { NCSI_PKT_RSP_SP, 4, ncsi_rsp_handler_sp },
+ { NCSI_PKT_RSP_DP, 4, ncsi_rsp_handler_dp },
+ { NCSI_PKT_RSP_EC, 4, ncsi_rsp_handler_ec },
+ { NCSI_PKT_RSP_DC, 4, ncsi_rsp_handler_dc },
+ { NCSI_PKT_RSP_RC, 4, ncsi_rsp_handler_rc },
+ { NCSI_PKT_RSP_ECNT, 4, ncsi_rsp_handler_ecnt },
+ { NCSI_PKT_RSP_DCNT, 4, ncsi_rsp_handler_dcnt },
+ { NCSI_PKT_RSP_AE, 4, ncsi_rsp_handler_ae },
+ { NCSI_PKT_RSP_SL, 4, ncsi_rsp_handler_sl },
+ { NCSI_PKT_RSP_GLS, 16, ncsi_rsp_handler_gls },
+ { NCSI_PKT_RSP_SVF, 4, ncsi_rsp_handler_svf },
+ { NCSI_PKT_RSP_EV, 4, ncsi_rsp_handler_ev },
+ { NCSI_PKT_RSP_DV, 4, ncsi_rsp_handler_dv },
+ { NCSI_PKT_RSP_SMA, 4, ncsi_rsp_handler_sma },
+ { NCSI_PKT_RSP_EBF, 4, ncsi_rsp_handler_ebf },
+ { NCSI_PKT_RSP_DBF, 4, ncsi_rsp_handler_dbf },
+ { NCSI_PKT_RSP_EGMF, 4, ncsi_rsp_handler_egmf },
+ { NCSI_PKT_RSP_DGMF, 4, ncsi_rsp_handler_dgmf },
+ { NCSI_PKT_RSP_SNFC, 4, ncsi_rsp_handler_snfc },
+ { NCSI_PKT_RSP_GVI, 36, ncsi_rsp_handler_gvi },
+ { NCSI_PKT_RSP_GC, 32, ncsi_rsp_handler_gc },
+ { NCSI_PKT_RSP_GP, -1, ncsi_rsp_handler_gp },
+ { NCSI_PKT_RSP_GCPS, 172, ncsi_rsp_handler_gcps },
+ { NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns },
+ { NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts },
+ { NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps },
+ { NCSI_PKT_RSP_OEM, 0, NULL },
+ { NCSI_PKT_RSP_PLDM, 0, NULL },
+ { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid }
+};
+
+int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct ncsi_rsp_handler *nrh = NULL;
+ struct ncsi_dev *nd;
+ struct ncsi_dev_priv *ndp;
+ struct ncsi_request *nr;
+ struct ncsi_pkt_hdr *hdr;
+ unsigned long flags;
+ int payload, i, ret;
+
+ /* Find the NCSI device */
+ nd = ncsi_find_dev(dev);
+ ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL;
+ if (!ndp)
+ return -ENODEV;
+
+ /* Check if it is AEN packet */
+ hdr = (struct ncsi_pkt_hdr *)skb_network_header(skb);
+ if (hdr->type == NCSI_PKT_AEN)
+ return ncsi_aen_handler(ndp, skb);
+
+ /* Find the handler */
+ for (i = 0; i < ARRAY_SIZE(ncsi_rsp_handlers); i++) {
+ if (ncsi_rsp_handlers[i].type == hdr->type) {
+ if (ncsi_rsp_handlers[i].handler)
+ nrh = &ncsi_rsp_handlers[i];
+ else
+ nrh = NULL;
+
+ break;
+ }
+ }
+
+ if (!nrh) {
+ netdev_err(nd->dev, "Received unrecognized packet (0x%x)\n",
+ hdr->type);
+ return -ENOENT;
+ }
+
+ /* Associate with the request */
+ spin_lock_irqsave(&ndp->lock, flags);
+ nr = &ndp->requests[hdr->id];
+ if (!nr->used) {
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ return -ENODEV;
+ }
+
+ nr->rsp = skb;
+ if (!nr->enabled) {
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ /* Validate the packet */
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ payload = nrh->payload;
+ if (payload < 0)
+ payload = ntohs(hdr->length);
+ ret = ncsi_validate_rsp_pkt(nr, payload);
+ if (ret)
+ goto out;
+
+ /* Process the packet */
+ ret = nrh->handler(nr);
+out:
+ ncsi_free_request(nr);
+ return ret;
+}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 95e757c37..9266ceebd 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -609,9 +609,8 @@ config NETFILTER_XT_MARK
The target allows you to create rules in the "mangle" table which alter
the netfilter mark (nfmark) field associated with the packet.
- Prior to routing, the nfmark can influence the routing method (see
- "Use netfilter MARK value as routing key") and can also be used by
- other subsystems to change their behavior.
+ Prior to routing, the nfmark can influence the routing method and can
+ also be used by other subsystems to change their behavior.
config NETFILTER_XT_CONNMARK
tristate 'ctmark target and match support'
@@ -753,9 +752,8 @@ config NETFILTER_XT_TARGET_HMARK
The target allows you to create rules in the "raw" and "mangle" tables
which set the skbuff mark by means of hash calculation within a given
- range. The nfmark can influence the routing method (see "Use netfilter
- MARK value as routing key") and can also be used by other subsystems to
- change their behaviour.
+ range. The nfmark can influence the routing method and can also be used
+ by other subsystems to change their behaviour.
To compile it as a module, choose M here. If unsure, say N.
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index d7024b2ed..5117bcb7d 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -395,6 +395,20 @@ static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_LAST] = "BUG!",
};
+static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = {
+ [IP_VS_TCP_S_NONE] = false,
+ [IP_VS_TCP_S_ESTABLISHED] = true,
+ [IP_VS_TCP_S_SYN_SENT] = true,
+ [IP_VS_TCP_S_SYN_RECV] = true,
+ [IP_VS_TCP_S_FIN_WAIT] = false,
+ [IP_VS_TCP_S_TIME_WAIT] = false,
+ [IP_VS_TCP_S_CLOSE] = false,
+ [IP_VS_TCP_S_CLOSE_WAIT] = false,
+ [IP_VS_TCP_S_LAST_ACK] = false,
+ [IP_VS_TCP_S_LISTEN] = false,
+ [IP_VS_TCP_S_SYNACK] = true,
+};
+
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
@@ -418,6 +432,13 @@ static const char * tcp_state_name(int state)
return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
}
+static bool tcp_state_active(int state)
+{
+ if (state >= IP_VS_TCP_S_LAST)
+ return false;
+ return tcp_state_active_table[state];
+}
+
static struct tcp_states_t tcp_states [] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
@@ -540,12 +561,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
if (dest) {
if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+ !tcp_state_active(new_state)) {
atomic_dec(&dest->activeconns);
atomic_inc(&dest->inactconns);
cp->flags |= IP_VS_CONN_F_INACTIVE;
} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+ tcp_state_active(new_state)) {
atomic_inc(&dest->activeconns);
atomic_dec(&dest->inactconns);
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9f530adad..9934b0c93 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -83,6 +83,13 @@ void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
spin_lock(lock);
while (unlikely(nf_conntrack_locks_all)) {
spin_unlock(lock);
+
+ /*
+ * Order the 'nf_conntrack_locks_all' load vs. the
+ * spin_unlock_wait() loads below, to ensure
+ * that 'nf_conntrack_locks_all_lock' is indeed held:
+ */
+ smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */
spin_unlock_wait(&nf_conntrack_locks_all_lock);
spin_lock(lock);
}
@@ -128,6 +135,14 @@ static void nf_conntrack_all_lock(void)
spin_lock(&nf_conntrack_locks_all_lock);
nf_conntrack_locks_all = true;
+ /*
+ * Order the above store of 'nf_conntrack_locks_all' against
+ * the spin_unlock_wait() loads below, such that if
+ * nf_conntrack_lock() observes 'nf_conntrack_locks_all'
+ * we must observe nf_conntrack_locks[] held:
+ */
+ smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */
+
for (i = 0; i < CONNTRACK_LOCKS; i++) {
spin_unlock_wait(&nf_conntrack_locks[i]);
}
@@ -135,7 +150,13 @@ static void nf_conntrack_all_lock(void)
static void nf_conntrack_all_unlock(void)
{
- nf_conntrack_locks_all = false;
+ /*
+ * All prior stores must be complete before we clear
+ * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
+ * might observe the false value but not the entire
+ * critical section:
+ */
+ smp_store_release(&nf_conntrack_locks_all, false);
spin_unlock(&nf_conntrack_locks_all_lock);
}
@@ -327,16 +348,10 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
tmpl->status = IPS_TEMPLATE;
write_pnet(&tmpl->ct_net, net);
-
- if (nf_ct_zone_add(tmpl, flags, zone) < 0)
- goto out_free;
-
+ nf_ct_zone_add(tmpl, zone);
atomic_set(&tmpl->ct_general.use, 0);
return tmpl;
-out_free:
- kfree(tmpl);
- return NULL;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
@@ -466,6 +481,23 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
net_eq(net, nf_ct_net(ct));
}
+/* must be called with rcu read lock held */
+void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
+{
+ struct hlist_nulls_head *hptr;
+ unsigned int sequence, hsz;
+
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ hsz = nf_conntrack_htable_size;
+ hptr = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ *hash = hptr;
+ *hsize = hsz;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_get_ht);
+
/*
* Warning :
* - Caller must take a reference on returned object
@@ -824,67 +856,69 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
/* There's a small race here where we may free a just-assured
connection. Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int _hash)
+static unsigned int early_drop_list(struct net *net,
+ struct hlist_nulls_head *head)
{
- /* Use oldest entry, which is roughly LRU */
struct nf_conntrack_tuple_hash *h;
- struct nf_conn *tmp;
struct hlist_nulls_node *n;
- unsigned int i, hash, sequence;
- struct nf_conn *ct = NULL;
- spinlock_t *lockp;
- bool ret = false;
+ unsigned int drops = 0;
+ struct nf_conn *tmp;
- i = 0;
+ hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
+ tmp = nf_ct_tuplehash_to_ctrack(h);
- local_bh_disable();
-restart:
- sequence = read_seqcount_begin(&nf_conntrack_generation);
- for (; i < NF_CT_EVICTION_RANGE; i++) {
- hash = scale_hash(_hash++);
- lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
- nf_conntrack_lock(lockp);
- if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
- spin_unlock(lockp);
- goto restart;
- }
- hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
- hnnode) {
- tmp = nf_ct_tuplehash_to_ctrack(h);
-
- if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
- !net_eq(nf_ct_net(tmp), net) ||
- nf_ct_is_dying(tmp))
- continue;
-
- if (atomic_inc_not_zero(&tmp->ct_general.use)) {
- ct = tmp;
- break;
- }
- }
+ if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
+ !net_eq(nf_ct_net(tmp), net) ||
+ nf_ct_is_dying(tmp))
+ continue;
- spin_unlock(lockp);
- if (ct)
- break;
+ if (!atomic_inc_not_zero(&tmp->ct_general.use))
+ continue;
+
+ /* kill only if still in same netns -- might have moved due to
+ * SLAB_DESTROY_BY_RCU rules.
+ *
+ * We steal the timer reference. If that fails timer has
+ * already fired or someone else deleted it. Just drop ref
+ * and move to next entry.
+ */
+ if (net_eq(nf_ct_net(tmp), net) &&
+ nf_ct_is_confirmed(tmp) &&
+ del_timer(&tmp->timeout) &&
+ nf_ct_delete(tmp, 0, 0))
+ drops++;
+
+ nf_ct_put(tmp);
}
- local_bh_enable();
+ return drops;
+}
- if (!ct)
- return false;
+static noinline int early_drop(struct net *net, unsigned int _hash)
+{
+ unsigned int i;
- /* kill only if in same netns -- might have moved due to
- * SLAB_DESTROY_BY_RCU rules
- */
- if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) {
- if (nf_ct_delete(ct, 0, 0)) {
- NF_CT_STAT_INC_ATOMIC(net, early_drop);
- ret = true;
+ for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
+ struct hlist_nulls_head *ct_hash;
+ unsigned hash, sequence, drops;
+
+ rcu_read_lock();
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ hash = scale_hash(_hash++);
+ ct_hash = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ drops = early_drop_list(net, &ct_hash[hash]);
+ rcu_read_unlock();
+
+ if (drops) {
+ NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
+ return true;
}
}
- nf_ct_put(ct);
- return ret;
+ return false;
}
static struct nf_conn *
@@ -930,16 +964,13 @@ __nf_conntrack_alloc(struct net *net,
offsetof(struct nf_conn, proto) -
offsetof(struct nf_conn, __nfct_init_offset[0]));
- if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0)
- goto out_free;
+ nf_ct_zone_add(ct, zone);
/* Because we use RCU lookups, we set ct_general.use to zero before
* this is inserted in any list.
*/
atomic_set(&ct->ct_general.use, 0);
return ct;
-out_free:
- kmem_cache_free(nf_conntrack_cachep, ct);
out:
atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
@@ -1004,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
if (IS_ERR(ct))
return (struct nf_conntrack_tuple_hash *)ct;
- if (tmpl && nfct_synproxy(tmpl)) {
- nfct_seqadj_ext_add(ct);
- nfct_synproxy_ext_add(ct);
+ if (!nf_ct_add_synproxy(ct, tmpl)) {
+ nf_conntrack_free(ct);
+ return ERR_PTR(-ENOMEM);
}
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
@@ -1343,14 +1374,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
}
EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
- .len = sizeof(struct nf_conntrack_zone),
- .align = __alignof__(struct nf_conntrack_zone),
- .id = NF_CT_EXT_ZONE,
-};
-#endif
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
@@ -1533,9 +1556,6 @@ void nf_conntrack_cleanup_end(void)
nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
-#ifdef CONFIG_NF_CONNTRACK_ZONES
- nf_ct_extend_unregister(&nf_ct_zone_extend);
-#endif
nf_conntrack_proto_fini();
nf_conntrack_seqadj_fini();
nf_conntrack_labels_fini();
@@ -1625,24 +1645,14 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
-int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
+int nf_conntrack_hash_resize(unsigned int hashsize)
{
- int i, bucket, rc;
- unsigned int hashsize, old_size;
+ int i, bucket;
+ unsigned int old_size;
struct hlist_nulls_head *hash, *old_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
- if (current->nsproxy->net_ns != &init_net)
- return -EOPNOTSUPP;
-
- /* On boot, we can set this without any fancy locking. */
- if (!nf_conntrack_htable_size)
- return param_set_uint(val, kp);
-
- rc = kstrtouint(val, 0, &hashsize);
- if (rc)
- return rc;
if (!hashsize)
return -EINVAL;
@@ -1650,6 +1660,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
if (!hash)
return -ENOMEM;
+ old_size = nf_conntrack_htable_size;
+ if (old_size == hashsize) {
+ nf_ct_free_hashtable(hash, hashsize);
+ return 0;
+ }
+
local_bh_disable();
nf_conntrack_all_lock();
write_seqcount_begin(&nf_conntrack_generation);
@@ -1685,6 +1701,25 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
nf_ct_free_hashtable(old_hash, old_size);
return 0;
}
+
+int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
+{
+ unsigned int hashsize;
+ int rc;
+
+ if (current->nsproxy->net_ns != &init_net)
+ return -EOPNOTSUPP;
+
+ /* On boot, we can set this without any fancy locking. */
+ if (!nf_conntrack_htable_size)
+ return param_set_uint(val, kp);
+
+ rc = kstrtouint(val, 0, &hashsize);
+ if (rc)
+ return rc;
+
+ return nf_conntrack_hash_resize(hashsize);
+}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
@@ -1741,7 +1776,7 @@ int nf_conntrack_init_start(void)
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
sizeof(struct nf_conn), 0,
- SLAB_DESTROY_BY_RCU, NULL);
+ SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
if (!nf_conntrack_cachep)
goto err_cachep;
@@ -1781,11 +1816,6 @@ int nf_conntrack_init_start(void)
if (ret < 0)
goto err_seqadj;
-#ifdef CONFIG_NF_CONNTRACK_ZONES
- ret = nf_ct_extend_register(&nf_ct_zone_extend);
- if (ret < 0)
- goto err_extend;
-#endif
ret = nf_conntrack_proto_init();
if (ret < 0)
goto err_proto;
@@ -1801,10 +1831,6 @@ int nf_conntrack_init_start(void)
return 0;
err_proto:
-#ifdef CONFIG_NF_CONNTRACK_ZONES
- nf_ct_extend_unregister(&nf_ct_zone_extend);
-err_extend:
-#endif
nf_conntrack_seqadj_fini();
err_seqadj:
nf_conntrack_labels_fini();
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 9e3693128..f8dbacf66 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -574,7 +574,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
helper = rcu_dereference(nfct_help(expect->master)->helper);
if (helper) {
seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name);
- if (helper->expect_policy[expect->class].name)
+ if (helper->expect_policy[expect->class].name[0])
seq_printf(s, "/%s",
helper->expect_policy[expect->class].name);
}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 1a9545965..02bcf00c2 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -73,7 +73,7 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
size_t var_alloc_len, gfp_t gfp)
{
struct nf_ct_ext *old, *new;
- int i, newlen, newoff;
+ int newlen, newoff;
struct nf_ct_ext_type *t;
/* Conntrack must not be confirmed to avoid races on reallocation. */
@@ -99,19 +99,8 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
return NULL;
if (new != old) {
- for (i = 0; i < NF_CT_EXT_NUM; i++) {
- if (!__nf_ct_ext_exist(old, i))
- continue;
-
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[i]);
- if (t && t->move)
- t->move((void *)new + new->offset[i],
- (void *)old + old->offset[i]);
- rcu_read_unlock();
- }
kfree_rcu(old, rcu);
- ct->ext = new;
+ rcu_assign_pointer(ct->ext, new);
}
new->offset[id] = newoff;
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 19efeba02..43147005b 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -572,7 +572,7 @@ static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
return 0;
}
-static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly;
+static struct nf_conntrack_helper ftp[MAX_PORTS * 2] __read_mostly;
static const struct nf_conntrack_expect_policy ftp_exp_policy = {
.max_expected = 1,
@@ -582,24 +582,13 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
/* don't make this __exit, since it's called from __init ! */
static void nf_conntrack_ftp_fini(void)
{
- int i, j;
- for (i = 0; i < ports_c; i++) {
- for (j = 0; j < 2; j++) {
- if (ftp[i][j].me == NULL)
- continue;
-
- pr_debug("unregistering helper for pf: %d port: %d\n",
- ftp[i][j].tuple.src.l3num, ports[i]);
- nf_conntrack_helper_unregister(&ftp[i][j]);
- }
- }
-
+ nf_conntrack_helpers_unregister(ftp, ports_c * 2);
kfree(ftp_buffer);
}
static int __init nf_conntrack_ftp_init(void)
{
- int i, j = -1, ret = 0;
+ int i, ret = 0;
ftp_buffer = kmalloc(65536, GFP_KERNEL);
if (!ftp_buffer)
@@ -611,32 +600,21 @@ static int __init nf_conntrack_ftp_init(void)
/* FIXME should be configurable whether IPv4 and IPv6 FTP connections
are tracked or not - YK */
for (i = 0; i < ports_c; i++) {
- ftp[i][0].tuple.src.l3num = PF_INET;
- ftp[i][1].tuple.src.l3num = PF_INET6;
- for (j = 0; j < 2; j++) {
- ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master);
- ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
- ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
- ftp[i][j].expect_policy = &ftp_exp_policy;
- ftp[i][j].me = THIS_MODULE;
- ftp[i][j].help = help;
- ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr;
- if (ports[i] == FTP_PORT)
- sprintf(ftp[i][j].name, "ftp");
- else
- sprintf(ftp[i][j].name, "ftp-%d", ports[i]);
-
- pr_debug("registering helper for pf: %d port: %d\n",
- ftp[i][j].tuple.src.l3num, ports[i]);
- ret = nf_conntrack_helper_register(&ftp[i][j]);
- if (ret) {
- pr_err("failed to register helper for pf: %d port: %d\n",
- ftp[i][j].tuple.src.l3num, ports[i]);
- ports_c = i;
- nf_conntrack_ftp_fini();
- return ret;
- }
- }
+ nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP, "ftp",
+ FTP_PORT, ports[i], ports[i], &ftp_exp_policy,
+ 0, sizeof(struct nf_ct_ftp_master), help,
+ nf_ct_ftp_from_nlattr, THIS_MODULE);
+ nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP, "ftp",
+ FTP_PORT, ports[i], ports[i], &ftp_exp_policy,
+ 0, sizeof(struct nf_ct_ftp_master), help,
+ nf_ct_ftp_from_nlattr, THIS_MODULE);
+ }
+
+ ret = nf_conntrack_helpers_register(ftp, ports_c * 2);
+ if (ret < 0) {
+ pr_err("failed to register helpers\n");
+ kfree(ftp_buffer);
+ return ret;
}
return 0;
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index bcd5ed6b7..89b2e4692 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -846,9 +846,10 @@ int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931)
sz -= len;
/* Message Type */
- if (sz < 1)
+ if (sz < 2)
return H323_ERROR_BOUND;
q931->MessageType = *p++;
+ sz--;
PRINT("MessageType = %02X\n", q931->MessageType);
if (*p & 0x80) {
p++;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 9511af04d..5c0db5c64 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1273,19 +1273,6 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
}
/****************************************************************************/
-static int set_expect_timeout(struct nf_conntrack_expect *exp,
- unsigned int timeout)
-{
- if (!exp || !del_timer(&exp->timeout))
- return 0;
-
- exp->timeout.expires = jiffies + timeout * HZ;
- add_timer(&exp->timeout);
-
- return 1;
-}
-
-/****************************************************************************/
static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int protoff, unsigned char **data,
@@ -1486,7 +1473,8 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
"timeout to %u seconds for",
info->timeout);
nf_ct_dump_tuple(&exp->tuple);
- set_expect_timeout(exp, info->timeout);
+ mod_timer_pending(&exp->timeout,
+ jiffies + info->timeout * HZ);
}
spin_unlock_bh(&nf_conntrack_expect_lock);
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 196cb3964..b989b81ac 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -389,11 +389,40 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
struct net *net)
{
struct nf_conntrack_tuple_hash *h;
+ const struct hlist_nulls_node *nn;
+ int cpu;
+
+ /* Get rid of expecteds, set helpers to NULL. */
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
+ unhelp(h, me);
+ spin_unlock_bh(&pcpu->lock);
+ }
+}
+
+void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+{
+ struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_expect *exp;
const struct hlist_node *next;
const struct hlist_nulls_node *nn;
+ unsigned int last_hsize;
+ spinlock_t *lock;
+ struct net *net;
unsigned int i;
- int cpu;
+
+ mutex_lock(&nf_ct_helper_mutex);
+ hlist_del_rcu(&me->hnode);
+ nf_ct_helper_count--;
+ mutex_unlock(&nf_ct_helper_mutex);
+
+ /* Make sure every nothing is still using the helper unless its a
+ * connection in the hash.
+ */
+ synchronize_rcu();
/* Get rid of expectations */
spin_lock_bh(&nf_conntrack_expect_lock);
@@ -413,47 +442,85 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
}
spin_unlock_bh(&nf_conntrack_expect_lock);
- /* Get rid of expecteds, set helpers to NULL. */
- for_each_possible_cpu(cpu) {
- struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+ rtnl_lock();
+ for_each_net(net)
+ __nf_conntrack_helper_unregister(me, net);
+ rtnl_unlock();
- spin_lock_bh(&pcpu->lock);
- hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
- unhelp(h, me);
- spin_unlock_bh(&pcpu->lock);
- }
local_bh_disable();
- for (i = 0; i < nf_conntrack_htable_size; i++) {
- nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
- if (i < nf_conntrack_htable_size) {
- hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
- unhelp(h, me);
+restart:
+ last_hsize = nf_conntrack_htable_size;
+ for (i = 0; i < last_hsize; i++) {
+ lock = &nf_conntrack_locks[i % CONNTRACK_LOCKS];
+ nf_conntrack_lock(lock);
+ if (last_hsize != nf_conntrack_htable_size) {
+ spin_unlock(lock);
+ goto restart;
}
- spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
+ unhelp(h, me);
+ spin_unlock(lock);
}
local_bh_enable();
}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
-void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+void nf_ct_helper_init(struct nf_conntrack_helper *helper,
+ u16 l3num, u16 protonum, const char *name,
+ u16 default_port, u16 spec_port, u32 id,
+ const struct nf_conntrack_expect_policy *exp_pol,
+ u32 expect_class_max, u32 data_len,
+ int (*help)(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo),
+ int (*from_nlattr)(struct nlattr *attr,
+ struct nf_conn *ct),
+ struct module *module)
{
- struct net *net;
+ helper->tuple.src.l3num = l3num;
+ helper->tuple.dst.protonum = protonum;
+ helper->tuple.src.u.all = htons(spec_port);
+ helper->expect_policy = exp_pol;
+ helper->expect_class_max = expect_class_max;
+ helper->data_len = data_len;
+ helper->help = help;
+ helper->from_nlattr = from_nlattr;
+ helper->me = module;
+
+ if (spec_port == default_port)
+ snprintf(helper->name, sizeof(helper->name), "%s", name);
+ else
+ snprintf(helper->name, sizeof(helper->name), "%s-%u", name, id);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_init);
- mutex_lock(&nf_ct_helper_mutex);
- hlist_del_rcu(&me->hnode);
- nf_ct_helper_count--;
- mutex_unlock(&nf_ct_helper_mutex);
+int nf_conntrack_helpers_register(struct nf_conntrack_helper *helper,
+ unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
- /* Make sure every nothing is still using the helper unless its a
- * connection in the hash.
- */
- synchronize_rcu();
+ for (i = 0; i < n; i++) {
+ err = nf_conntrack_helper_register(&helper[i]);
+ if (err < 0)
+ goto err;
+ }
- rtnl_lock();
- for_each_net(net)
- __nf_conntrack_helper_unregister(me, net);
- rtnl_unlock();
+ return err;
+err:
+ if (i > 0)
+ nf_conntrack_helpers_unregister(helper, i);
+ return err;
}
-EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
+EXPORT_SYMBOL_GPL(nf_conntrack_helpers_register);
+
+void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper,
+ unsigned int n)
+{
+ while (n-- > 0)
+ nf_conntrack_helper_unregister(&helper[n]);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister);
static struct nf_ct_ext_type helper_extend __read_mostly = {
.len = sizeof(struct nf_conn_help),
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index f97ac61d2..1972a149f 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -255,27 +255,18 @@ static int __init nf_conntrack_irc_init(void)
ports[ports_c++] = IRC_PORT;
for (i = 0; i < ports_c; i++) {
- irc[i].tuple.src.l3num = AF_INET;
- irc[i].tuple.src.u.tcp.port = htons(ports[i]);
- irc[i].tuple.dst.protonum = IPPROTO_TCP;
- irc[i].expect_policy = &irc_exp_policy;
- irc[i].me = THIS_MODULE;
- irc[i].help = help;
-
- if (ports[i] == IRC_PORT)
- sprintf(irc[i].name, "irc");
- else
- sprintf(irc[i].name, "irc-%u", i);
-
- ret = nf_conntrack_helper_register(&irc[i]);
- if (ret) {
- pr_err("failed to register helper for pf: %u port: %u\n",
- irc[i].tuple.src.l3num, ports[i]);
- ports_c = i;
- nf_conntrack_irc_fini();
- return ret;
- }
+ nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, "irc",
+ IRC_PORT, ports[i], i, &irc_exp_policy,
+ 0, 0, help, NULL, THIS_MODULE);
+ }
+
+ ret = nf_conntrack_helpers_register(&irc[0], ports_c);
+ if (ret) {
+ pr_err("failed to register helpers\n");
+ kfree(irc_buffer);
+ return ret;
}
+
return 0;
}
@@ -283,10 +274,7 @@ static int __init nf_conntrack_irc_init(void)
* it is needed by the init function */
static void nf_conntrack_irc_fini(void)
{
- int i;
-
- for (i = 0; i < ports_c; i++)
- nf_conntrack_helper_unregister(&irc[i]);
+ nf_conntrack_helpers_unregister(irc, ports_c);
kfree(irc_buffer);
}
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 252e6a7cd..bcab8bde7 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -16,23 +16,6 @@
static spinlock_t nf_connlabels_lock;
-int nf_connlabel_set(struct nf_conn *ct, u16 bit)
-{
- struct nf_conn_labels *labels = nf_ct_labels_find(ct);
-
- if (!labels || BIT_WORD(bit) >= labels->words)
- return -ENOSPC;
-
- if (test_bit(bit, labels->bits))
- return 0;
-
- if (!test_and_set_bit(bit, labels->bits))
- nf_conntrack_event_cache(IPCT_LABEL, ct);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_connlabel_set);
-
static int replace_u32(u32 *address, u32 mask, u32 new)
{
u32 old, tmp;
@@ -60,7 +43,7 @@ int nf_connlabels_replace(struct nf_conn *ct,
if (!labels)
return -ENOSPC;
- size = labels->words * sizeof(long);
+ size = sizeof(labels->bits);
if (size < (words32 * sizeof(u32)))
words32 = size / sizeof(u32);
@@ -80,16 +63,11 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace);
int nf_connlabels_get(struct net *net, unsigned int bits)
{
- size_t words;
-
- words = BIT_WORD(bits) + 1;
- if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long))
+ if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long))
return -ERANGE;
spin_lock(&nf_connlabels_lock);
net->ct.labels_used++;
- if (words > net->ct.label_words)
- net->ct.label_words = words;
spin_unlock(&nf_connlabels_lock);
return 0;
@@ -100,8 +78,6 @@ void nf_connlabels_put(struct net *net)
{
spin_lock(&nf_connlabels_lock);
net->ct.labels_used--;
- if (net->ct.labels_used == 0)
- net->ct.label_words = 0;
spin_unlock(&nf_connlabels_lock);
}
EXPORT_SYMBOL_GPL(nf_connlabels_put);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index a18d1ceab..fdfc71f41 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -346,25 +346,25 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct)
if (!labels)
return 0;
- return nla_total_size(labels->words * sizeof(long));
+ return nla_total_size(sizeof(labels->bits));
}
static int
ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
- unsigned int len, i;
+ unsigned int i;
if (!labels)
return 0;
- len = labels->words * sizeof(long);
i = 0;
do {
if (labels->bits[i] != 0)
- return nla_put(skb, CTA_LABELS, len, labels->bits);
+ return nla_put(skb, CTA_LABELS, sizeof(labels->bits),
+ labels->bits);
i++;
- } while (i < labels->words);
+ } while (i < ARRAY_SIZE(labels->bits));
return 0;
}
@@ -1894,6 +1894,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
return -EINVAL;
+ if (otuple.dst.protonum != rtuple.dst.protonum)
+ return -EINVAL;
ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple,
&rtuple, u3);
@@ -2362,12 +2364,8 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
return PTR_ERR(exp);
err = nf_ct_expect_related_report(exp, portid, report);
- if (err < 0) {
- nf_ct_expect_put(exp);
- return err;
- }
-
- return 0;
+ nf_ct_expect_put(exp);
+ return err;
}
static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 3fcbaab83..9dcb9ee9b 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -166,7 +166,7 @@ out:
return ret;
}
-static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly;
+static struct nf_conntrack_helper sane[MAX_PORTS * 2] __read_mostly;
static const struct nf_conntrack_expect_policy sane_exp_policy = {
.max_expected = 1,
@@ -176,22 +176,13 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
/* don't make this __exit, since it's called from __init ! */
static void nf_conntrack_sane_fini(void)
{
- int i, j;
-
- for (i = 0; i < ports_c; i++) {
- for (j = 0; j < 2; j++) {
- pr_debug("unregistering helper for pf: %d port: %d\n",
- sane[i][j].tuple.src.l3num, ports[i]);
- nf_conntrack_helper_unregister(&sane[i][j]);
- }
- }
-
+ nf_conntrack_helpers_unregister(sane, ports_c * 2);
kfree(sane_buffer);
}
static int __init nf_conntrack_sane_init(void)
{
- int i, j = -1, ret = 0;
+ int i, ret = 0;
sane_buffer = kmalloc(65536, GFP_KERNEL);
if (!sane_buffer)
@@ -203,31 +194,23 @@ static int __init nf_conntrack_sane_init(void)
/* FIXME should be configurable whether IPv4 and IPv6 connections
are tracked or not - YK */
for (i = 0; i < ports_c; i++) {
- sane[i][0].tuple.src.l3num = PF_INET;
- sane[i][1].tuple.src.l3num = PF_INET6;
- for (j = 0; j < 2; j++) {
- sane[i][j].data_len = sizeof(struct nf_ct_sane_master);
- sane[i][j].tuple.src.u.tcp.port = htons(ports[i]);
- sane[i][j].tuple.dst.protonum = IPPROTO_TCP;
- sane[i][j].expect_policy = &sane_exp_policy;
- sane[i][j].me = THIS_MODULE;
- sane[i][j].help = help;
- if (ports[i] == SANE_PORT)
- sprintf(sane[i][j].name, "sane");
- else
- sprintf(sane[i][j].name, "sane-%d", ports[i]);
-
- pr_debug("registering helper for pf: %d port: %d\n",
- sane[i][j].tuple.src.l3num, ports[i]);
- ret = nf_conntrack_helper_register(&sane[i][j]);
- if (ret) {
- pr_err("failed to register helper for pf: %d port: %d\n",
- sane[i][j].tuple.src.l3num, ports[i]);
- ports_c = i;
- nf_conntrack_sane_fini();
- return ret;
- }
- }
+ nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP, "sane",
+ SANE_PORT, ports[i], ports[i],
+ &sane_exp_policy, 0,
+ sizeof(struct nf_ct_sane_master), help, NULL,
+ THIS_MODULE);
+ nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP, "sane",
+ SANE_PORT, ports[i], ports[i],
+ &sane_exp_policy, 0,
+ sizeof(struct nf_ct_sane_master), help, NULL,
+ THIS_MODULE);
+ }
+
+ ret = nf_conntrack_helpers_register(sane, ports_c * 2);
+ if (ret < 0) {
+ pr_err("failed to register helpers\n");
+ kfree(sane_buffer);
+ return ret;
}
return 0;
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index f72ba5587..7d77217de 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1383,7 +1383,7 @@ static int process_sip_response(struct sk_buff *skb, unsigned int protoff,
return NF_DROP;
}
cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
- if (!cseq) {
+ if (!cseq && *(*dptr + matchoff) != '0') {
nf_ct_helper_log(skb, ct, "cannot get cseq");
return NF_DROP;
}
@@ -1446,7 +1446,7 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff,
return NF_DROP;
}
cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
- if (!cseq) {
+ if (!cseq && *(*dptr + matchoff) != '0') {
nf_ct_helper_log(skb, ct, "cannot get cseq");
return NF_DROP;
}
@@ -1589,7 +1589,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen);
}
-static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly;
+static struct nf_conntrack_helper sip[MAX_PORTS * 4] __read_mostly;
static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = {
[SIP_EXPECT_SIGNALLING] = {
@@ -1616,20 +1616,12 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1
static void nf_conntrack_sip_fini(void)
{
- int i, j;
-
- for (i = 0; i < ports_c; i++) {
- for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
- if (sip[i][j].me == NULL)
- continue;
- nf_conntrack_helper_unregister(&sip[i][j]);
- }
- }
+ nf_conntrack_helpers_unregister(sip, ports_c * 4);
}
static int __init nf_conntrack_sip_init(void)
{
- int i, j, ret;
+ int i, ret;
if (ports_c == 0)
ports[ports_c++] = SIP_PORT;
@@ -1637,43 +1629,32 @@ static int __init nf_conntrack_sip_init(void)
for (i = 0; i < ports_c; i++) {
memset(&sip[i], 0, sizeof(sip[i]));
- sip[i][0].tuple.src.l3num = AF_INET;
- sip[i][0].tuple.dst.protonum = IPPROTO_UDP;
- sip[i][0].help = sip_help_udp;
- sip[i][1].tuple.src.l3num = AF_INET;
- sip[i][1].tuple.dst.protonum = IPPROTO_TCP;
- sip[i][1].help = sip_help_tcp;
-
- sip[i][2].tuple.src.l3num = AF_INET6;
- sip[i][2].tuple.dst.protonum = IPPROTO_UDP;
- sip[i][2].help = sip_help_udp;
- sip[i][3].tuple.src.l3num = AF_INET6;
- sip[i][3].tuple.dst.protonum = IPPROTO_TCP;
- sip[i][3].help = sip_help_tcp;
-
- for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
- sip[i][j].data_len = sizeof(struct nf_ct_sip_master);
- sip[i][j].tuple.src.u.udp.port = htons(ports[i]);
- sip[i][j].expect_policy = sip_exp_policy;
- sip[i][j].expect_class_max = SIP_EXPECT_MAX;
- sip[i][j].me = THIS_MODULE;
-
- if (ports[i] == SIP_PORT)
- sprintf(sip[i][j].name, "sip");
- else
- sprintf(sip[i][j].name, "sip-%u", i);
-
- pr_debug("port #%u: %u\n", i, ports[i]);
+ nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip",
+ SIP_PORT, ports[i], i, sip_exp_policy,
+ SIP_EXPECT_MAX,
+ sizeof(struct nf_ct_sip_master), sip_help_udp,
+ NULL, THIS_MODULE);
+ nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP, "sip",
+ SIP_PORT, ports[i], i, sip_exp_policy,
+ SIP_EXPECT_MAX,
+ sizeof(struct nf_ct_sip_master), sip_help_tcp,
+ NULL, THIS_MODULE);
+ nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP, "sip",
+ SIP_PORT, ports[i], i, sip_exp_policy,
+ SIP_EXPECT_MAX,
+ sizeof(struct nf_ct_sip_master), sip_help_udp,
+ NULL, THIS_MODULE);
+ nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP, "sip",
+ SIP_PORT, ports[i], i, sip_exp_policy,
+ SIP_EXPECT_MAX,
+ sizeof(struct nf_ct_sip_master), sip_help_tcp,
+ NULL, THIS_MODULE);
+ }
- ret = nf_conntrack_helper_register(&sip[i][j]);
- if (ret) {
- pr_err("failed to register helper for pf: %u port: %u\n",
- sip[i][j].tuple.src.l3num, ports[i]);
- ports_c = i;
- nf_conntrack_sip_fini();
- return ret;
- }
- }
+ ret = nf_conntrack_helpers_register(sip, ports_c * 4);
+ if (ret < 0) {
+ pr_err("failed to register helpers\n");
+ return ret;
}
return 0;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index c026c472e..9f267c3ff 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -48,6 +48,8 @@ EXPORT_SYMBOL_GPL(print_tuple);
struct ct_iter_state {
struct seq_net_private p;
+ struct hlist_nulls_head *hash;
+ unsigned int htable_size;
unsigned int bucket;
u_int64_t time_now;
};
@@ -58,9 +60,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
struct hlist_nulls_node *n;
for (st->bucket = 0;
- st->bucket < nf_conntrack_htable_size;
+ st->bucket < st->htable_size;
st->bucket++) {
- n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
+ n = rcu_dereference(
+ hlist_nulls_first_rcu(&st->hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -75,12 +78,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= nf_conntrack_htable_size)
+ if (++st->bucket >= st->htable_size)
return NULL;
}
head = rcu_dereference(
- hlist_nulls_first_rcu(
- &nf_conntrack_hash[st->bucket]));
+ hlist_nulls_first_rcu(&st->hash[st->bucket]));
}
return head;
}
@@ -102,6 +104,8 @@ static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
st->time_now = ktime_get_real_ns();
rcu_read_lock();
+
+ nf_conntrack_get_ht(&st->hash, &st->htable_size);
return ct_get_idx(seq, *pos);
}
@@ -201,6 +205,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
+ struct net *net = seq_file_net(s);
int ret = 0;
NF_CT_ASSERT(ct);
@@ -211,6 +216,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (NF_CT_DIRECTION(hash))
goto release;
+ if (!net_eq(nf_ct_net(ct), net))
+ goto release;
+
l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
NF_CT_ASSERT(l3proto);
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
@@ -434,8 +442,29 @@ static void nf_conntrack_standalone_fini_proc(struct net *net)
#ifdef CONFIG_SYSCTL
/* Log invalid packets of a given protocol */
-static int log_invalid_proto_min = 0;
-static int log_invalid_proto_max = 255;
+static int log_invalid_proto_min __read_mostly;
+static int log_invalid_proto_max __read_mostly = 255;
+
+/* size the user *wants to set */
+static unsigned int nf_conntrack_htable_size_user __read_mostly;
+
+static int
+nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret < 0 || !write)
+ return ret;
+
+ /* update ret, we might not be able to satisfy request */
+ ret = nf_conntrack_hash_resize(nf_conntrack_htable_size_user);
+
+ /* update it to the actual value used by conntrack */
+ nf_conntrack_htable_size_user = nf_conntrack_htable_size;
+ return ret;
+}
static struct ctl_table_header *nf_ct_netfilter_header;
@@ -456,10 +485,10 @@ static struct ctl_table nf_ct_sysctl_table[] = {
},
{
.procname = "nf_conntrack_buckets",
- .data = &nf_conntrack_htable_size,
+ .data = &nf_conntrack_htable_size_user,
.maxlen = sizeof(unsigned int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
+ .mode = 0644,
+ .proc_handler = nf_conntrack_hash_sysctl,
},
{
.procname = "nf_conntrack_checksum",
@@ -515,6 +544,9 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
if (net->user_ns != &init_user_ns)
table[0].procname = NULL;
+ if (!net_eq(&init_net, net))
+ table[2].mode = 0444;
+
net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
if (!net->ct.sysctl_header)
goto out_unregister_netfilter;
@@ -604,6 +636,8 @@ static int __init nf_conntrack_standalone_init(void)
ret = -ENOMEM;
goto out_sysctl;
}
+
+ nf_conntrack_htable_size_user = nf_conntrack_htable_size;
#endif
ret = register_pernet_subsys(&nf_conntrack_net_ops);
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 2e65b5430..b1227dc6f 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -97,7 +97,7 @@ static int tftp_help(struct sk_buff *skb,
return ret;
}
-static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly;
+static struct nf_conntrack_helper tftp[MAX_PORTS * 2] __read_mostly;
static const struct nf_conntrack_expect_policy tftp_exp_policy = {
.max_expected = 1,
@@ -106,47 +106,29 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = {
static void nf_conntrack_tftp_fini(void)
{
- int i, j;
-
- for (i = 0; i < ports_c; i++) {
- for (j = 0; j < 2; j++)
- nf_conntrack_helper_unregister(&tftp[i][j]);
- }
+ nf_conntrack_helpers_unregister(tftp, ports_c * 2);
}
static int __init nf_conntrack_tftp_init(void)
{
- int i, j, ret;
+ int i, ret;
if (ports_c == 0)
ports[ports_c++] = TFTP_PORT;
for (i = 0; i < ports_c; i++) {
- memset(&tftp[i], 0, sizeof(tftp[i]));
-
- tftp[i][0].tuple.src.l3num = AF_INET;
- tftp[i][1].tuple.src.l3num = AF_INET6;
- for (j = 0; j < 2; j++) {
- tftp[i][j].tuple.dst.protonum = IPPROTO_UDP;
- tftp[i][j].tuple.src.u.udp.port = htons(ports[i]);
- tftp[i][j].expect_policy = &tftp_exp_policy;
- tftp[i][j].me = THIS_MODULE;
- tftp[i][j].help = tftp_help;
-
- if (ports[i] == TFTP_PORT)
- sprintf(tftp[i][j].name, "tftp");
- else
- sprintf(tftp[i][j].name, "tftp-%u", i);
-
- ret = nf_conntrack_helper_register(&tftp[i][j]);
- if (ret) {
- pr_err("failed to register helper for pf: %u port: %u\n",
- tftp[i][j].tuple.src.l3num, ports[i]);
- ports_c = i;
- nf_conntrack_tftp_fini();
- return ret;
- }
- }
+ nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP, "tftp",
+ TFTP_PORT, ports[i], i, &tftp_exp_policy,
+ 0, 0, tftp_help, NULL, THIS_MODULE);
+ nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP, "tftp",
+ TFTP_PORT, ports[i], i, &tftp_exp_policy,
+ 0, 0, tftp_help, NULL, THIS_MODULE);
+ }
+
+ ret = nf_conntrack_helpers_register(tftp, ports_c * 2);
+ if (ret < 0) {
+ pr_err("failed to register helpers\n");
+ return ret;
}
return 0;
}
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index a5d41dfa9..aa5847a16 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -159,6 +159,20 @@ int nf_logger_find_get(int pf, enum nf_log_type type)
struct nf_logger *logger;
int ret = -ENOENT;
+ if (pf == NFPROTO_INET) {
+ ret = nf_logger_find_get(NFPROTO_IPV4, type);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_logger_find_get(NFPROTO_IPV6, type);
+ if (ret < 0) {
+ nf_logger_put(NFPROTO_IPV4, type);
+ return ret;
+ }
+
+ return 0;
+ }
+
if (rcu_access_pointer(loggers[pf][type]) == NULL)
request_module("nf-logger-%u-%u", pf, type);
@@ -167,7 +181,7 @@ int nf_logger_find_get(int pf, enum nf_log_type type)
if (logger == NULL)
goto out;
- if (logger && try_module_get(logger->me))
+ if (try_module_get(logger->me))
ret = 0;
out:
rcu_read_unlock();
@@ -179,6 +193,12 @@ void nf_logger_put(int pf, enum nf_log_type type)
{
struct nf_logger *logger;
+ if (pf == NFPROTO_INET) {
+ nf_logger_put(NFPROTO_IPV4, type);
+ nf_logger_put(NFPROTO_IPV6, type);
+ return;
+ }
+
BUG_ON(loggers[pf][type] == NULL);
rcu_read_lock();
@@ -398,16 +418,17 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
{
const struct nf_logger *logger;
char buf[NFLOGGER_NAME_LEN];
- size_t size = *lenp;
int r = 0;
int tindex = (unsigned long)table->extra1;
struct net *net = current->nsproxy->net_ns;
if (write) {
- if (size > sizeof(buf))
- size = sizeof(buf);
- if (copy_from_user(buf, buffer, size))
- return -EFAULT;
+ struct ctl_table tmp = *table;
+
+ tmp.data = buf;
+ r = proc_dostring(&tmp, write, buffer, lenp, ppos);
+ if (r)
+ return r;
if (!strcmp(buf, "NONE")) {
nf_log_unbind_pf(net, tindex);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 6877a396f..ecee105bb 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -30,17 +30,19 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>
-static DEFINE_SPINLOCK(nf_nat_lock);
-
static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
__read_mostly;
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
-static struct hlist_head *nf_nat_bysource __read_mostly;
-static unsigned int nf_nat_htable_size __read_mostly;
-static unsigned int nf_nat_hash_rnd __read_mostly;
+struct nf_nat_conn_key {
+ const struct net *net;
+ const struct nf_conntrack_tuple *tuple;
+ const struct nf_conntrack_zone *zone;
+};
+
+static struct rhashtable nf_nat_bysource_table;
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
@@ -119,19 +121,17 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
-/* We keep an extra hash for each conntrack, for fast searching. */
-static inline unsigned int
-hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
+static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
{
- unsigned int hash;
-
- get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+ const struct nf_conntrack_tuple *t;
+ const struct nf_conn *ct = data;
+ t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
/* Original src, to ensure we map it consistently if poss. */
- hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
- tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
- return reciprocal_scale(hash, nf_nat_htable_size);
+ seed ^= net_hash_mix(nf_ct_net(ct));
+ return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
+ t->dst.protonum ^ seed);
}
/* Is this tuple already taken? (not by us) */
@@ -187,6 +187,26 @@ same_src(const struct nf_conn *ct,
t->src.u.all == tuple->src.u.all);
}
+static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct nf_nat_conn_key *key = arg->key;
+ const struct nf_conn *ct = obj;
+
+ return same_src(ct, key->tuple) &&
+ net_eq(nf_ct_net(ct), key->net) &&
+ nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL);
+}
+
+static struct rhashtable_params nf_nat_bysource_params = {
+ .head_offset = offsetof(struct nf_conn, nat_bysource),
+ .obj_hashfn = nf_nat_bysource_hash,
+ .obj_cmpfn = nf_nat_bysource_cmp,
+ .nelem_hint = 256,
+ .min_size = 1024,
+ .nulls_base = (1U << RHT_BASE_SHIFT),
+};
+
/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
@@ -197,25 +217,23 @@ find_appropriate_src(struct net *net,
struct nf_conntrack_tuple *result,
const struct nf_nat_range *range)
{
- unsigned int h = hash_by_src(net, tuple);
- const struct nf_conn_nat *nat;
const struct nf_conn *ct;
+ struct nf_nat_conn_key key = {
+ .net = net,
+ .tuple = tuple,
+ .zone = zone
+ };
- hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) {
- ct = nat->ct;
- if (same_src(ct, tuple) &&
- net_eq(net, nf_ct_net(ct)) &&
- nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
- /* Copy source part from reply tuple. */
- nf_ct_invert_tuplepr(result,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- result->dst = tuple->dst;
-
- if (in_range(l3proto, l4proto, result, range))
- return 1;
- }
- }
- return 0;
+ ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key,
+ nf_nat_bysource_params);
+ if (!ct)
+ return 0;
+
+ nf_ct_invert_tuplepr(result,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ result->dst = tuple->dst;
+
+ return in_range(l3proto, l4proto, result, range);
}
/* For [FUTURE] fragmentation handling, we want the least-used
@@ -387,7 +405,6 @@ nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype)
{
- struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
struct nf_conn_nat *nat;
@@ -424,21 +441,18 @@ nf_nat_setup_info(struct nf_conn *ct,
ct->status |= IPS_DST_NAT;
if (nfct_help(ct))
- nfct_seqadj_ext_add(ct);
+ if (!nfct_seqadj_ext_add(ct))
+ return NF_DROP;
}
if (maniptype == NF_NAT_MANIP_SRC) {
- unsigned int srchash;
-
- srchash = hash_by_src(net,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- spin_lock_bh(&nf_nat_lock);
- /* nf_conntrack_alter_reply might re-allocate extension aera */
- nat = nfct_nat(ct);
- nat->ct = ct;
- hlist_add_head_rcu(&nat->bysource,
- &nf_nat_bysource[srchash]);
- spin_unlock_bh(&nf_nat_lock);
+ int err;
+
+ err = rhashtable_insert_fast(&nf_nat_bysource_table,
+ &ct->nat_bysource,
+ nf_nat_bysource_params);
+ if (err)
+ return NF_DROP;
}
/* It's done. */
@@ -543,7 +557,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
if (nf_nat_proto_remove(ct, data))
return 1;
- if (!nat || !nat->ct)
+ if (!nat)
return 0;
/* This netns is being destroyed, and conntrack has nat null binding.
@@ -555,11 +569,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
if (!del_timer(&ct->timeout))
return 1;
- spin_lock_bh(&nf_nat_lock);
- hlist_del_rcu(&nat->bysource);
ct->status &= ~IPS_NAT_DONE_MASK;
- nat->ct = NULL;
- spin_unlock_bh(&nf_nat_lock);
+
+ rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
+ nf_nat_bysource_params);
add_timer(&ct->timeout);
@@ -688,35 +701,17 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
- if (nat == NULL || nat->ct == NULL)
- return;
-
- NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
-
- spin_lock_bh(&nf_nat_lock);
- hlist_del_rcu(&nat->bysource);
- spin_unlock_bh(&nf_nat_lock);
-}
-
-static void nf_nat_move_storage(void *new, void *old)
-{
- struct nf_conn_nat *new_nat = new;
- struct nf_conn_nat *old_nat = old;
- struct nf_conn *ct = old_nat->ct;
-
- if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
+ if (!nat)
return;
- spin_lock_bh(&nf_nat_lock);
- hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
- spin_unlock_bh(&nf_nat_lock);
+ rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
+ nf_nat_bysource_params);
}
static struct nf_ct_ext_type nat_extend __read_mostly = {
.len = sizeof(struct nf_conn_nat),
.align = __alignof__(struct nf_conn_nat),
.destroy = nf_nat_cleanup_conntrack,
- .move = nf_nat_move_storage,
.id = NF_CT_EXT_NAT,
.flags = NF_CT_EXT_F_PREALLOC,
};
@@ -813,7 +808,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
if (err < 0)
return err;
- return nf_nat_setup_info(ct, &range, manip);
+ return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
}
#else
static int
@@ -845,16 +840,13 @@ static int __init nf_nat_init(void)
{
int ret;
- /* Leave them the same for the moment. */
- nf_nat_htable_size = nf_conntrack_htable_size;
-
- nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
- if (!nf_nat_bysource)
- return -ENOMEM;
+ ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
+ if (ret)
+ return ret;
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
- nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+ rhashtable_destroy(&nf_nat_bysource_table);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
@@ -878,7 +870,7 @@ static int __init nf_nat_init(void)
return 0;
cleanup_extend:
- nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+ rhashtable_destroy(&nf_nat_bysource_table);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
@@ -896,8 +888,8 @@ static void __exit nf_nat_cleanup(void)
#endif
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
- synchronize_net();
- nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+
+ rhashtable_destroy(&nf_nat_bysource_table);
}
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index cf7c74599..7e1c876c7 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -131,29 +131,8 @@ static void nft_trans_destroy(struct nft_trans *trans)
kfree(trans);
}
-static int nft_register_basechain(struct nft_base_chain *basechain,
- unsigned int hook_nops)
-{
- struct net *net = read_pnet(&basechain->pnet);
-
- if (basechain->flags & NFT_BASECHAIN_DISABLED)
- return 0;
-
- return nf_register_net_hooks(net, basechain->ops, hook_nops);
-}
-
-static void nft_unregister_basechain(struct nft_base_chain *basechain,
- unsigned int hook_nops)
-{
- struct net *net = read_pnet(&basechain->pnet);
-
- if (basechain->flags & NFT_BASECHAIN_DISABLED)
- return;
-
- nf_unregister_net_hooks(net, basechain->ops, hook_nops);
-}
-
-static int nf_tables_register_hooks(const struct nft_table *table,
+static int nf_tables_register_hooks(struct net *net,
+ const struct nft_table *table,
struct nft_chain *chain,
unsigned int hook_nops)
{
@@ -161,10 +140,12 @@ static int nf_tables_register_hooks(const struct nft_table *table,
!(chain->flags & NFT_BASE_CHAIN))
return 0;
- return nft_register_basechain(nft_base_chain(chain), hook_nops);
+ return nf_register_net_hooks(net, nft_base_chain(chain)->ops,
+ hook_nops);
}
-static void nf_tables_unregister_hooks(const struct nft_table *table,
+static void nf_tables_unregister_hooks(struct net *net,
+ const struct nft_table *table,
struct nft_chain *chain,
unsigned int hook_nops)
{
@@ -172,12 +153,9 @@ static void nf_tables_unregister_hooks(const struct nft_table *table,
!(chain->flags & NFT_BASE_CHAIN))
return;
- nft_unregister_basechain(nft_base_chain(chain), hook_nops);
+ nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, hook_nops);
}
-/* Internal table flags */
-#define NFT_TABLE_INACTIVE (1 << 15)
-
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
{
struct nft_trans *trans;
@@ -187,7 +165,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
return -ENOMEM;
if (msg_type == NFT_MSG_NEWTABLE)
- ctx->table->flags |= NFT_TABLE_INACTIVE;
+ nft_activate_next(ctx->net, ctx->table);
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
@@ -201,7 +179,7 @@ static int nft_deltable(struct nft_ctx *ctx)
if (err < 0)
return err;
- list_del_rcu(&ctx->table->list);
+ nft_deactivate_next(ctx->net, ctx->table);
return err;
}
@@ -214,7 +192,7 @@ static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
return -ENOMEM;
if (msg_type == NFT_MSG_NEWCHAIN)
- ctx->chain->flags |= NFT_CHAIN_INACTIVE;
+ nft_activate_next(ctx->net, ctx->chain);
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
@@ -229,47 +207,17 @@ static int nft_delchain(struct nft_ctx *ctx)
return err;
ctx->table->use--;
- list_del_rcu(&ctx->chain->list);
+ nft_deactivate_next(ctx->net, ctx->chain);
return err;
}
-static inline bool
-nft_rule_is_active(struct net *net, const struct nft_rule *rule)
-{
- return (rule->genmask & nft_genmask_cur(net)) == 0;
-}
-
-static inline int
-nft_rule_is_active_next(struct net *net, const struct nft_rule *rule)
-{
- return (rule->genmask & nft_genmask_next(net)) == 0;
-}
-
-static inline void
-nft_rule_activate_next(struct net *net, struct nft_rule *rule)
-{
- /* Now inactive, will be active in the future */
- rule->genmask = nft_genmask_cur(net);
-}
-
-static inline void
-nft_rule_deactivate_next(struct net *net, struct nft_rule *rule)
-{
- rule->genmask = nft_genmask_next(net);
-}
-
-static inline void nft_rule_clear(struct net *net, struct nft_rule *rule)
-{
- rule->genmask &= ~nft_genmask_next(net);
-}
-
static int
nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
{
/* You cannot delete the same rule twice */
- if (nft_rule_is_active_next(ctx->net, rule)) {
- nft_rule_deactivate_next(ctx->net, rule);
+ if (nft_is_active_next(ctx->net, rule)) {
+ nft_deactivate_next(ctx->net, rule);
ctx->chain->use--;
return 0;
}
@@ -322,9 +270,6 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx)
return 0;
}
-/* Internal set flag */
-#define NFT_SET_INACTIVE (1 << 15)
-
static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
struct nft_set *set)
{
@@ -337,7 +282,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) {
nft_trans_set_id(trans) =
ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
- set->flags |= NFT_SET_INACTIVE;
+ nft_activate_next(ctx->net, set);
}
nft_trans_set(trans) = set;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
@@ -353,7 +298,7 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
if (err < 0)
return err;
- list_del_rcu(&set->list);
+ nft_deactivate_next(ctx->net, set);
ctx->table->use--;
return err;
@@ -364,26 +309,29 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
*/
static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
- const struct nlattr *nla)
+ const struct nlattr *nla,
+ u8 genmask)
{
struct nft_table *table;
list_for_each_entry(table, &afi->tables, list) {
- if (!nla_strcmp(nla, table->name))
+ if (!nla_strcmp(nla, table->name) &&
+ nft_active_genmask(table, genmask))
return table;
}
return NULL;
}
static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
- const struct nlattr *nla)
+ const struct nlattr *nla,
+ u8 genmask)
{
struct nft_table *table;
if (nla == NULL)
return ERR_PTR(-EINVAL);
- table = nft_table_lookup(afi, nla);
+ table = nft_table_lookup(afi, nla, genmask);
if (table != NULL)
return table;
@@ -524,6 +472,8 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
if (idx > s_idx)
memset(&cb->args[1], 0,
sizeof(cb->args) - sizeof(cb->args[0]));
+ if (!nft_is_active(net, table))
+ continue;
if (nf_tables_fill_table_info(skb, net,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -548,6 +498,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_cur(net);
const struct nft_af_info *afi;
const struct nft_table *table;
struct sk_buff *skb2;
@@ -565,11 +516,9 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask);
if (IS_ERR(table))
return PTR_ERR(table);
- if (table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb2)
@@ -588,17 +537,21 @@ err:
return err;
}
-static int nf_tables_table_enable(const struct nft_af_info *afi,
+static int nf_tables_table_enable(struct net *net,
+ const struct nft_af_info *afi,
struct nft_table *table)
{
struct nft_chain *chain;
int err, i = 0;
list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_active_next(net, chain))
+ continue;
if (!(chain->flags & NFT_BASE_CHAIN))
continue;
- err = nft_register_basechain(nft_base_chain(chain), afi->nops);
+ err = nf_register_net_hooks(net, nft_base_chain(chain)->ops,
+ afi->nops);
if (err < 0)
goto err;
@@ -607,26 +560,34 @@ static int nf_tables_table_enable(const struct nft_af_info *afi,
return 0;
err:
list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_active_next(net, chain))
+ continue;
if (!(chain->flags & NFT_BASE_CHAIN))
continue;
if (i-- <= 0)
break;
- nft_unregister_basechain(nft_base_chain(chain), afi->nops);
+ nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
+ afi->nops);
}
return err;
}
-static void nf_tables_table_disable(const struct nft_af_info *afi,
+static void nf_tables_table_disable(struct net *net,
+ const struct nft_af_info *afi,
struct nft_table *table)
{
struct nft_chain *chain;
list_for_each_entry(chain, &table->chains, list) {
- if (chain->flags & NFT_BASE_CHAIN)
- nft_unregister_basechain(nft_base_chain(chain),
- afi->nops);
+ if (!nft_is_active_next(net, chain))
+ continue;
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
+ afi->nops);
}
}
@@ -656,7 +617,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
nft_trans_table_enable(trans) = false;
} else if (!(flags & NFT_TABLE_F_DORMANT) &&
ctx->table->flags & NFT_TABLE_F_DORMANT) {
- ret = nf_tables_table_enable(ctx->afi, ctx->table);
+ ret = nf_tables_table_enable(ctx->net, ctx->afi, ctx->table);
if (ret >= 0) {
ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
nft_trans_table_enable(trans) = true;
@@ -678,6 +639,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
const struct nlattr *name;
struct nft_af_info *afi;
struct nft_table *table;
@@ -691,7 +653,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
return PTR_ERR(afi);
name = nla[NFTA_TABLE_NAME];
- table = nf_tables_table_lookup(afi, name);
+ table = nf_tables_table_lookup(afi, name, genmask);
if (IS_ERR(table)) {
if (PTR_ERR(table) != -ENOENT)
return PTR_ERR(table);
@@ -699,8 +661,6 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
}
if (table != NULL) {
- if (table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
if (nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -752,6 +712,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
struct nft_set *set, *ns;
list_for_each_entry(chain, &ctx->table->chains, list) {
+ if (!nft_is_active_next(ctx->net, chain))
+ continue;
+
ctx->chain = chain;
err = nft_delrule_by_chain(ctx);
@@ -760,6 +723,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
}
list_for_each_entry_safe(set, ns, &ctx->table->sets, list) {
+ if (!nft_is_active_next(ctx->net, set))
+ continue;
+
if (set->flags & NFT_SET_ANONYMOUS &&
!list_empty(&set->bindings))
continue;
@@ -770,6 +736,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
}
list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) {
+ if (!nft_is_active_next(ctx->net, chain))
+ continue;
+
ctx->chain = chain;
err = nft_delchain(ctx);
@@ -795,6 +764,9 @@ static int nft_flush(struct nft_ctx *ctx, int family)
ctx->afi = afi;
list_for_each_entry_safe(table, nt, &afi->tables, list) {
+ if (!nft_is_active_next(ctx->net, table))
+ continue;
+
if (nla[NFTA_TABLE_NAME] &&
nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0)
continue;
@@ -815,6 +787,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
struct nft_af_info *afi;
struct nft_table *table;
int family = nfmsg->nfgen_family;
@@ -828,7 +801,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask);
if (IS_ERR(table))
return PTR_ERR(table);
@@ -875,12 +848,14 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
*/
static struct nft_chain *
-nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle)
+nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle,
+ u8 genmask)
{
struct nft_chain *chain;
list_for_each_entry(chain, &table->chains, list) {
- if (chain->handle == handle)
+ if (chain->handle == handle &&
+ nft_active_genmask(chain, genmask))
return chain;
}
@@ -888,7 +863,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle)
}
static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
- const struct nlattr *nla)
+ const struct nlattr *nla,
+ u8 genmask)
{
struct nft_chain *chain;
@@ -896,7 +872,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
return ERR_PTR(-EINVAL);
list_for_each_entry(chain, &table->chains, list) {
- if (!nla_strcmp(nla, chain->name))
+ if (!nla_strcmp(nla, chain->name) &&
+ nft_active_genmask(chain, genmask))
return chain;
}
@@ -1079,6 +1056,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
if (idx > s_idx)
memset(&cb->args[1], 0,
sizeof(cb->args) - sizeof(cb->args[0]));
+ if (!nft_is_active(net, chain))
+ continue;
if (nf_tables_fill_chain_info(skb, net,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -1104,6 +1083,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_cur(net);
const struct nft_af_info *afi;
const struct nft_table *table;
const struct nft_chain *chain;
@@ -1122,17 +1102,13 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask);
if (IS_ERR(table))
return PTR_ERR(table);
- if (table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
- chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+ chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
- if (chain->flags & NFT_CHAIN_INACTIVE)
- return -ENOENT;
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb2)
@@ -1231,6 +1207,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
struct nft_chain *chain;
struct nft_base_chain *basechain = NULL;
struct nlattr *ha[NFTA_HOOK_MAX + 1];
+ u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
struct net_device *dev = NULL;
u8 policy = NF_ACCEPT;
@@ -1247,7 +1224,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask);
if (IS_ERR(table))
return PTR_ERR(table);
@@ -1256,11 +1233,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
if (nla[NFTA_CHAIN_HANDLE]) {
handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
- chain = nf_tables_chain_lookup_byhandle(table, handle);
+ chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
} else {
- chain = nf_tables_chain_lookup(table, name);
+ chain = nf_tables_chain_lookup(table, name, genmask);
if (IS_ERR(chain)) {
if (PTR_ERR(chain) != -ENOENT)
return PTR_ERR(chain);
@@ -1291,16 +1268,20 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
struct nft_stats *stats = NULL;
struct nft_trans *trans;
- if (chain->flags & NFT_CHAIN_INACTIVE)
- return -ENOENT;
if (nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- if (nla[NFTA_CHAIN_HANDLE] && name &&
- !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
- return -EEXIST;
+ if (nla[NFTA_CHAIN_HANDLE] && name) {
+ struct nft_chain *chain2;
+
+ chain2 = nf_tables_chain_lookup(table,
+ nla[NFTA_CHAIN_NAME],
+ genmask);
+ if (IS_ERR(chain2))
+ return PTR_ERR(chain2);
+ }
if (nla[NFTA_CHAIN_COUNTERS]) {
if (!(chain->flags & NFT_BASE_CHAIN))
@@ -1424,7 +1405,6 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
rcu_assign_pointer(basechain->stats, stats);
}
- write_pnet(&basechain->pnet, net);
basechain->type = type;
chain = &basechain->chain;
@@ -1455,7 +1435,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
chain->table = table;
nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
- err = nf_tables_register_hooks(table, chain, afi->nops);
+ err = nf_tables_register_hooks(net, table, chain, afi->nops);
if (err < 0)
goto err1;
@@ -1468,7 +1448,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
list_add_tail_rcu(&chain->list, &table->chains);
return 0;
err2:
- nf_tables_unregister_hooks(table, chain, afi->nops);
+ nf_tables_unregister_hooks(net, table, chain, afi->nops);
err1:
nf_tables_chain_destroy(chain);
return err;
@@ -1479,6 +1459,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
struct nft_af_info *afi;
struct nft_table *table;
struct nft_chain *chain;
@@ -1489,11 +1470,11 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask);
if (IS_ERR(table))
return PTR_ERR(table);
- chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+ chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
if (chain->use > 0)
@@ -1878,10 +1859,16 @@ err:
return err;
}
+struct nft_rule_dump_ctx {
+ char table[NFT_TABLE_MAXNAMELEN];
+ char chain[NFT_CHAIN_MAXNAMELEN];
+};
+
static int nf_tables_dump_rules(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_rule_dump_ctx *ctx = cb->data;
const struct nft_af_info *afi;
const struct nft_table *table;
const struct nft_chain *chain;
@@ -1898,9 +1885,17 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
continue;
list_for_each_entry_rcu(table, &afi->tables, list) {
+ if (ctx && ctx->table[0] &&
+ strcmp(ctx->table, table->name) != 0)
+ continue;
+
list_for_each_entry_rcu(chain, &table->chains, list) {
+ if (ctx && ctx->chain[0] &&
+ strcmp(ctx->chain, chain->name) != 0)
+ continue;
+
list_for_each_entry_rcu(rule, &chain->rules, list) {
- if (!nft_rule_is_active(net, rule))
+ if (!nft_is_active(net, rule))
goto cont;
if (idx < s_idx)
goto cont;
@@ -1928,11 +1923,18 @@ done:
return skb->len;
}
+static int nf_tables_dump_rules_done(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+ return 0;
+}
+
static int nf_tables_getrule(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_cur(net);
const struct nft_af_info *afi;
const struct nft_table *table;
const struct nft_chain *chain;
@@ -1944,7 +1946,25 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_rules,
+ .done = nf_tables_dump_rules_done,
};
+
+ if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
+ struct nft_rule_dump_ctx *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (nla[NFTA_RULE_TABLE])
+ nla_strlcpy(ctx->table, nla[NFTA_RULE_TABLE],
+ sizeof(ctx->table));
+ if (nla[NFTA_RULE_CHAIN])
+ nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN],
+ sizeof(ctx->chain));
+ c.data = ctx;
+ }
+
return netlink_dump_start(nlsk, skb, nlh, &c);
}
@@ -1952,17 +1972,13 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask);
if (IS_ERR(table))
return PTR_ERR(table);
- if (table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
- chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+ chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
- if (chain->flags & NFT_CHAIN_INACTIVE)
- return -ENOENT;
rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
if (IS_ERR(rule))
@@ -2011,6 +2027,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
struct nft_af_info *afi;
struct nft_table *table;
struct nft_chain *chain;
@@ -2031,11 +2048,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask);
if (IS_ERR(table))
return PTR_ERR(table);
- chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+ chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
@@ -2104,7 +2121,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
if (rule == NULL)
goto err1;
- nft_rule_activate_next(net, rule);
+ nft_activate_next(net, rule);
rule->handle = handle;
rule->dlen = size;
@@ -2126,14 +2143,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
if (nlh->nlmsg_flags & NLM_F_REPLACE) {
- if (nft_rule_is_active_next(net, old_rule)) {
+ if (nft_is_active_next(net, old_rule)) {
trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
old_rule);
if (trans == NULL) {
err = -ENOMEM;
goto err2;
}
- nft_rule_deactivate_next(net, old_rule);
+ nft_deactivate_next(net, old_rule);
chain->use--;
list_add_tail_rcu(&rule->list, &old_rule->list);
} else {
@@ -2176,6 +2193,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
struct nft_af_info *afi;
struct nft_table *table;
struct nft_chain *chain = NULL;
@@ -2187,12 +2205,13 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask);
if (IS_ERR(table))
return PTR_ERR(table);
if (nla[NFTA_RULE_CHAIN]) {
- chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+ chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN],
+ genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
}
@@ -2212,6 +2231,9 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
}
} else {
list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_active_next(net, chain))
+ continue;
+
ctx.chain = chain;
err = nft_delrule_by_chain(&ctx);
if (err < 0)
@@ -2341,7 +2363,8 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
- const struct nlattr * const nla[])
+ const struct nlattr * const nla[],
+ u8 genmask)
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi = NULL;
@@ -2357,7 +2380,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
if (afi == NULL)
return -EAFNOSUPPORT;
- table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE],
+ genmask);
if (IS_ERR(table))
return PTR_ERR(table);
}
@@ -2367,7 +2391,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
}
struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
- const struct nlattr *nla)
+ const struct nlattr *nla, u8 genmask)
{
struct nft_set *set;
@@ -2375,22 +2399,27 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
return ERR_PTR(-EINVAL);
list_for_each_entry(set, &table->sets, list) {
- if (!nla_strcmp(nla, set->name))
+ if (!nla_strcmp(nla, set->name) &&
+ nft_active_genmask(set, genmask))
return set;
}
return ERR_PTR(-ENOENT);
}
struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
- const struct nlattr *nla)
+ const struct nlattr *nla,
+ u8 genmask)
{
struct nft_trans *trans;
u32 id = ntohl(nla_get_be32(nla));
list_for_each_entry(trans, &net->nft.commit_list, list) {
+ struct nft_set *set = nft_trans_set(trans);
+
if (trans->msg_type == NFT_MSG_NEWSET &&
- id == nft_trans_set_id(trans))
- return nft_trans_set(trans);
+ id == nft_trans_set_id(trans) &&
+ nft_active_genmask(set, genmask))
+ return set;
}
return ERR_PTR(-ENOENT);
}
@@ -2415,6 +2444,8 @@ cont:
list_for_each_entry(i, &ctx->table->sets, list) {
int tmp;
+ if (!nft_is_active_next(ctx->net, set))
+ continue;
if (!sscanf(i->name, name, &tmp))
continue;
if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE)
@@ -2434,6 +2465,8 @@ cont:
snprintf(set->name, sizeof(set->name), name, min + n);
list_for_each_entry(i, &ctx->table->sets, list) {
+ if (!nft_is_active_next(ctx->net, i))
+ continue;
if (!strcmp(set->name, i->name))
return -ENFILE;
}
@@ -2582,6 +2615,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
list_for_each_entry_rcu(set, &table->sets, list) {
if (idx < s_idx)
goto cont;
+ if (!nft_is_active(net, set))
+ goto cont;
ctx_set = *ctx;
ctx_set.table = table;
@@ -2618,6 +2653,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ u8 genmask = nft_genmask_cur(net);
const struct nft_set *set;
struct nft_ctx ctx;
struct sk_buff *skb2;
@@ -2625,7 +2661,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
int err;
/* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
if (err < 0)
return err;
@@ -2652,11 +2688,9 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
if (!nla[NFTA_SET_TABLE])
return -EINVAL;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
- if (set->flags & NFT_SET_INACTIVE)
- return -ENOENT;
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb2 == NULL)
@@ -2695,6 +2729,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
const struct nft_set_ops *ops;
struct nft_af_info *afi;
struct nft_table *table;
@@ -2792,13 +2827,13 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], genmask);
if (IS_ERR(table))
return PTR_ERR(table);
nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
- set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
+ set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set)) {
if (PTR_ERR(set) != -ENOENT)
return PTR_ERR(set);
@@ -2845,7 +2880,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
}
INIT_LIST_HEAD(&set->bindings);
- write_pnet(&set->pnet, net);
set->ops = ops;
set->ktype = ktype;
set->klen = desc.klen;
@@ -2897,6 +2931,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
struct nft_set *set;
struct nft_ctx ctx;
int err;
@@ -2906,11 +2941,11 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_TABLE] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
if (err < 0)
return err;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
if (!list_empty(&set->bindings))
@@ -2975,7 +3010,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
list_del_rcu(&binding->list);
if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS &&
- !(set->flags & NFT_SET_INACTIVE))
+ nft_is_active(ctx->net, set))
nf_tables_set_destroy(ctx, set);
}
@@ -3031,7 +3066,8 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
- const struct nlattr * const nla[])
+ const struct nlattr * const nla[],
+ u8 genmask)
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
@@ -3041,7 +3077,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
if (IS_ERR(afi))
return PTR_ERR(afi);
- table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE],
+ genmask);
if (IS_ERR(table))
return PTR_ERR(table);
@@ -3138,6 +3175,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
+ u8 genmask = nft_genmask_cur(net);
const struct nft_set *set;
struct nft_set_dump_args args;
struct nft_ctx ctx;
@@ -3154,17 +3192,14 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
return err;
err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh,
- (void *)nla);
+ (void *)nla, genmask);
if (err < 0)
return err;
- if (ctx.table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+ genmask);
if (IS_ERR(set))
return PTR_ERR(set);
- if (set->flags & NFT_SET_INACTIVE)
- return -ENOENT;
event = NFT_MSG_NEWSETELEM;
event |= NFNL_SUBSYS_NFTABLES << 8;
@@ -3218,21 +3253,19 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ u8 genmask = nft_genmask_cur(net);
const struct nft_set *set;
struct nft_ctx ctx;
int err;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
if (err < 0)
return err;
- if (ctx.table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+ genmask);
if (IS_ERR(set))
return PTR_ERR(set);
- if (set->flags & NFT_SET_INACTIVE)
- return -ENOENT;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
@@ -3525,7 +3558,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
goto err4;
ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
- err = set->ops->insert(set, &elem);
+ err = set->ops->insert(ctx->net, set, &elem);
if (err < 0)
goto err5;
@@ -3550,6 +3583,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ u8 genmask = nft_genmask_next(net);
const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
@@ -3558,15 +3592,17 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
if (err < 0)
return err;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+ genmask);
if (IS_ERR(set)) {
if (nla[NFTA_SET_ELEM_LIST_SET_ID]) {
set = nf_tables_set_lookup_byid(net,
- nla[NFTA_SET_ELEM_LIST_SET_ID]);
+ nla[NFTA_SET_ELEM_LIST_SET_ID],
+ genmask);
}
if (IS_ERR(set))
return PTR_ERR(set);
@@ -3646,7 +3682,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
goto err3;
}
- priv = set->ops->deactivate(set, &elem);
+ priv = set->ops->deactivate(ctx->net, set, &elem);
if (priv == NULL) {
err = -ENOENT;
goto err4;
@@ -3672,6 +3708,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ u8 genmask = nft_genmask_next(net);
const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
@@ -3680,11 +3717,12 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
if (err < 0)
return err;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+ genmask);
if (IS_ERR(set))
return PTR_ERR(set);
if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
@@ -3952,36 +3990,40 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
if (!nft_trans_table_enable(trans)) {
- nf_tables_table_disable(trans->ctx.afi,
+ nf_tables_table_disable(net,
+ trans->ctx.afi,
trans->ctx.table);
trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
}
} else {
- trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE;
+ nft_clear(net, trans->ctx.table);
}
nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
nft_trans_destroy(trans);
break;
case NFT_MSG_DELTABLE:
+ list_del_rcu(&trans->ctx.table->list);
nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans))
nft_chain_commit_update(trans);
else
- trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE;
+ nft_clear(net, trans->ctx.chain);
nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
nft_trans_destroy(trans);
break;
case NFT_MSG_DELCHAIN:
+ list_del_rcu(&trans->ctx.chain->list);
nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
- nf_tables_unregister_hooks(trans->ctx.table,
+ nf_tables_unregister_hooks(trans->ctx.net,
+ trans->ctx.table,
trans->ctx.chain,
trans->ctx.afi->nops);
break;
case NFT_MSG_NEWRULE:
- nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+ nft_clear(trans->ctx.net, nft_trans_rule(trans));
nf_tables_rule_notify(&trans->ctx,
nft_trans_rule(trans),
NFT_MSG_NEWRULE);
@@ -3994,7 +4036,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
NFT_MSG_DELRULE);
break;
case NFT_MSG_NEWSET:
- nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE;
+ nft_clear(net, nft_trans_set(trans));
/* This avoids hitting -EBUSY when deleting the table
* from the transaction.
*/
@@ -4007,13 +4049,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSET:
+ list_del_rcu(&nft_trans_set(trans)->list);
nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
NFT_MSG_DELSET, GFP_KERNEL);
break;
case NFT_MSG_NEWSETELEM:
te = (struct nft_trans_elem *)trans->data;
- te->set->ops->activate(te->set, &te->elem);
+ te->set->ops->activate(net, te->set, &te->elem);
nf_tables_setelem_notify(&trans->ctx, te->set,
&te->elem,
NFT_MSG_NEWSETELEM, 0);
@@ -4078,7 +4121,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
if (nft_trans_table_enable(trans)) {
- nf_tables_table_disable(trans->ctx.afi,
+ nf_tables_table_disable(net,
+ trans->ctx.afi,
trans->ctx.table);
trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
}
@@ -4088,8 +4132,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
}
break;
case NFT_MSG_DELTABLE:
- list_add_tail_rcu(&trans->ctx.table->list,
- &trans->ctx.afi->tables);
+ nft_clear(trans->ctx.net, trans->ctx.table);
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWCHAIN:
@@ -4100,15 +4143,15 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
} else {
trans->ctx.table->use--;
list_del_rcu(&trans->ctx.chain->list);
- nf_tables_unregister_hooks(trans->ctx.table,
+ nf_tables_unregister_hooks(trans->ctx.net,
+ trans->ctx.table,
trans->ctx.chain,
trans->ctx.afi->nops);
}
break;
case NFT_MSG_DELCHAIN:
trans->ctx.table->use++;
- list_add_tail_rcu(&trans->ctx.chain->list,
- &trans->ctx.table->chains);
+ nft_clear(trans->ctx.net, trans->ctx.chain);
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWRULE:
@@ -4117,7 +4160,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
break;
case NFT_MSG_DELRULE:
trans->ctx.chain->use++;
- nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+ nft_clear(trans->ctx.net, nft_trans_rule(trans));
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSET:
@@ -4126,8 +4169,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
break;
case NFT_MSG_DELSET:
trans->ctx.table->use++;
- list_add_tail_rcu(&nft_trans_set(trans)->list,
- &trans->ctx.table->sets);
+ nft_clear(trans->ctx.net, nft_trans_set(trans));
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSETELEM:
@@ -4139,7 +4181,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
case NFT_MSG_DELSETELEM:
te = (struct nft_trans_elem *)trans->data;
- te->set->ops->activate(te->set, &te->elem);
+ te->set->ops->activate(net, te->set, &te->elem);
te->set->ndeact--;
nft_trans_destroy(trans);
@@ -4274,6 +4316,8 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
}
list_for_each_entry(set, &ctx->table->sets, list) {
+ if (!nft_is_active_next(ctx->net, set))
+ continue;
if (!(set->flags & NFT_SET_MAP) ||
set->dtype != NFT_DATA_VERDICT)
continue;
@@ -4432,6 +4476,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_data_desc *desc, const struct nlattr *nla)
{
+ u8 genmask = nft_genmask_next(ctx->net);
struct nlattr *tb[NFTA_VERDICT_MAX + 1];
struct nft_chain *chain;
int err;
@@ -4464,7 +4509,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
if (!tb[NFTA_VERDICT_CHAIN])
return -EINVAL;
chain = nf_tables_chain_lookup(ctx->table,
- tb[NFTA_VERDICT_CHAIN]);
+ tb[NFTA_VERDICT_CHAIN], genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
if (chain->flags & NFT_BASE_CHAIN)
@@ -4642,7 +4687,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
BUG_ON(!(ctx->chain->flags & NFT_BASE_CHAIN));
- nf_tables_unregister_hooks(ctx->chain->table, ctx->chain,
+ nf_tables_unregister_hooks(ctx->net, ctx->chain->table, ctx->chain,
ctx->afi->nops);
list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
list_del(&rule->list);
@@ -4671,7 +4716,8 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
list_for_each_entry_safe(table, nt, &afi->tables, list) {
list_for_each_entry(chain, &table->chains, list)
- nf_tables_unregister_hooks(table, chain, afi->nops);
+ nf_tables_unregister_hooks(net, table, chain,
+ afi->nops);
/* No packets are walking on these chains anymore. */
ctx.table = table;
list_for_each_entry(chain, &table->chains, list) {
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 5eefe4a35..75d696f11 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -30,7 +30,6 @@ nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
if (!iph)
return;
- iph = ip_hdr(skb);
if (iph->ihl < 5 || iph->version != 4)
return;
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 39eb1cc62..fa24a5b39 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -237,7 +237,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
break;
case NFT_TRACETYPE_POLICY:
if (nla_put_be32(skb, NFTA_TRACE_POLICY,
- info->basechain->policy))
+ htonl(info->basechain->policy)))
goto nla_put_failure;
break;
}
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 1b4de4bd6..d44d89b56 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -326,14 +326,14 @@ static int nfnl_acct_try_del(struct nf_acct *cur)
{
int ret = 0;
- /* we want to avoid races with nfnl_acct_find_get. */
- if (atomic_dec_and_test(&cur->refcnt)) {
+ /* We want to avoid races with nfnl_acct_put. So only when the current
+ * refcnt is 1, we decrease it to 0.
+ */
+ if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) {
/* We are protected by nfnl mutex. */
list_del_rcu(&cur->head);
kfree_rcu(cur, rcu_head);
} else {
- /* still in use, restore reference counter. */
- atomic_inc(&cur->refcnt);
ret = -EBUSY;
}
return ret;
@@ -343,12 +343,12 @@ static int nfnl_acct_del(struct net *net, struct sock *nfnl,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const tb[])
{
- char *acct_name;
- struct nf_acct *cur;
+ struct nf_acct *cur, *tmp;
int ret = -ENOENT;
+ char *acct_name;
if (!tb[NFACCT_NAME]) {
- list_for_each_entry(cur, &net->nfnl_acct_list, head)
+ list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head)
nfnl_acct_try_del(cur);
return 0;
@@ -443,7 +443,7 @@ void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
}
EXPORT_SYMBOL_GPL(nfnl_acct_update);
-static void nfnl_overquota_report(struct nf_acct *nfacct)
+static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct)
{
int ret;
struct sk_buff *skb;
@@ -458,11 +458,12 @@ static void nfnl_overquota_report(struct nf_acct *nfacct)
kfree_skb(skb);
return;
}
- netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
+ netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
GFP_ATOMIC);
}
-int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
+int nfnl_acct_overquota(struct net *net, const struct sk_buff *skb,
+ struct nf_acct *nfacct)
{
u64 now;
u64 *quota;
@@ -480,7 +481,7 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
if (now >= *quota &&
!test_and_set_bit(NFACCT_OVERQUOTA_BIT, &nfacct->flags)) {
- nfnl_overquota_report(nfacct);
+ nfnl_overquota_report(net, nfacct);
}
return ret;
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 3c84f1432..139e0867e 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -98,31 +98,28 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
break;
}
- l4proto = nf_ct_l4proto_find_get(l3num, l4num);
-
- /* This protocol is not supportted, skip. */
- if (l4proto->l4proto != l4num) {
- ret = -EOPNOTSUPP;
- goto err_proto_put;
- }
-
if (matching) {
if (nlh->nlmsg_flags & NLM_F_REPLACE) {
/* You cannot replace one timeout policy by another of
* different kind, sorry.
*/
if (matching->l3num != l3num ||
- matching->l4proto->l4proto != l4num) {
- ret = -EINVAL;
- goto err_proto_put;
- }
-
- ret = ctnl_timeout_parse_policy(&matching->data,
- l4proto, net,
- cda[CTA_TIMEOUT_DATA]);
- return ret;
+ matching->l4proto->l4proto != l4num)
+ return -EINVAL;
+
+ return ctnl_timeout_parse_policy(&matching->data,
+ matching->l4proto, net,
+ cda[CTA_TIMEOUT_DATA]);
}
- ret = -EBUSY;
+
+ return -EBUSY;
+ }
+
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ /* This protocol is not supportted, skip. */
+ if (l4proto->l4proto != l4num) {
+ ret = -EOPNOTSUPP;
goto err_proto_put;
}
@@ -303,16 +300,33 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
{
struct nf_conntrack_tuple_hash *h;
const struct hlist_nulls_node *nn;
- int i;
+ unsigned int last_hsize;
+ spinlock_t *lock;
+ int i, cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
+ untimeout(h, timeout);
+ spin_unlock_bh(&pcpu->lock);
+ }
local_bh_disable();
- for (i = 0; i < nf_conntrack_htable_size; i++) {
- nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
- if (i < nf_conntrack_htable_size) {
- hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
- untimeout(h, timeout);
+restart:
+ last_hsize = nf_conntrack_htable_size;
+ for (i = 0; i < last_hsize; i++) {
+ lock = &nf_conntrack_locks[i % CONNTRACK_LOCKS];
+ nf_conntrack_lock(lock);
+ if (last_hsize != nf_conntrack_htable_size) {
+ spin_unlock(lock);
+ goto restart;
}
- spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+
+ hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
+ untimeout(h, timeout);
+ spin_unlock(lock);
}
local_bh_enable();
}
@@ -322,16 +336,16 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
{
int ret = 0;
- /* we want to avoid races with nf_ct_timeout_find_get. */
- if (atomic_dec_and_test(&timeout->refcnt)) {
+ /* We want to avoid races with ctnl_timeout_put. So only when the
+ * current refcnt is 1, we decrease it to 0.
+ */
+ if (atomic_cmpxchg(&timeout->refcnt, 1, 0) == 1) {
/* We are protected by nfnl mutex. */
list_del_rcu(&timeout->head);
nf_ct_l4proto_put(timeout->l4proto);
ctnl_untimeout(net, timeout);
kfree_rcu(timeout, rcu_head);
} else {
- /* still in use, restore reference counter. */
- atomic_inc(&timeout->refcnt);
ret = -EBUSY;
}
return ret;
@@ -342,12 +356,13 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
- struct ctnl_timeout *cur;
+ struct ctnl_timeout *cur, *tmp;
int ret = -ENOENT;
char *name;
if (!cda[CTA_TIMEOUT_NAME]) {
- list_for_each_entry(cur, &net->nfct_timeout_list, head)
+ list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list,
+ head)
ctnl_timeout_try_del(net, cur);
return 0;
@@ -535,7 +550,9 @@ err:
static void ctnl_timeout_put(struct ctnl_timeout *timeout)
{
- atomic_dec(&timeout->refcnt);
+ if (atomic_dec_and_test(&timeout->refcnt))
+ kfree_rcu(timeout, rcu_head);
+
module_put(THIS_MODULE);
}
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
@@ -583,7 +600,9 @@ static void __net_exit cttimeout_net_exit(struct net *net)
list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) {
list_del_rcu(&cur->head);
nf_ct_l4proto_put(cur->l4proto);
- kfree_rcu(cur, rcu_head);
+
+ if (atomic_dec_and_test(&cur->refcnt))
+ kfree_rcu(cur, rcu_head);
}
}
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 11f81c838..6577db524 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -700,10 +700,13 @@ nfulnl_log_packet(struct net *net,
break;
case NFULNL_COPY_PACKET:
- if (inst->copy_range > skb->len)
+ data_len = inst->copy_range;
+ if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) &&
+ (li->u.ulog.copy_len < data_len))
+ data_len = li->u.ulog.copy_len;
+
+ if (data_len > skb->len)
data_len = skb->len;
- else
- data_len = inst->copy_range;
size += nla_total_size(data_len);
break;
@@ -1144,6 +1147,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
MODULE_ALIAS_NF_LOGGER(AF_INET, 1);
MODULE_ALIAS_NF_LOGGER(AF_INET6, 1);
MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1);
+MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */
module_init(nfnetlink_log_init);
module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 5d36a0926..f49f45081 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1145,10 +1145,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int err;
- queue = instance_lookup(q, queue_num);
- if (!queue)
- queue = verdict_instance_lookup(q, queue_num,
- NETLINK_CB(skb).portid);
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
if (IS_ERR(queue))
return PTR_ERR(queue);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 6228c422c..c21e7eb8d 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -23,6 +23,20 @@
#include <linux/netfilter_arp/arp_tables.h>
#include <net/netfilter/nf_tables.h>
+struct nft_xt {
+ struct list_head head;
+ struct nft_expr_ops ops;
+ unsigned int refcnt;
+};
+
+static void nft_xt_put(struct nft_xt *xt)
+{
+ if (--xt->refcnt == 0) {
+ list_del(&xt->head);
+ kfree(xt);
+ }
+}
+
static int nft_compat_chain_validate_dependency(const char *tablename,
const struct nft_chain *chain)
{
@@ -260,6 +274,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
if (par.target->destroy != NULL)
par.target->destroy(&par);
+ nft_xt_put(container_of(expr->ops, struct nft_xt, ops));
module_put(target->me);
}
@@ -442,6 +457,7 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
if (par.match->destroy != NULL)
par.match->destroy(&par);
+ nft_xt_put(container_of(expr->ops, struct nft_xt, ops));
module_put(match->me);
}
@@ -612,11 +628,6 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = {
static LIST_HEAD(nft_match_list);
-struct nft_xt {
- struct list_head head;
- struct nft_expr_ops ops;
-};
-
static struct nft_expr_type nft_match_type;
static bool nft_match_cmp(const struct xt_match *match,
@@ -634,6 +645,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
struct xt_match *match;
char *mt_name;
u32 rev, family;
+ int err;
if (tb[NFTA_MATCH_NAME] == NULL ||
tb[NFTA_MATCH_REV] == NULL ||
@@ -652,6 +664,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
if (!try_module_get(match->me))
return ERR_PTR(-ENOENT);
+ nft_match->refcnt++;
return &nft_match->ops;
}
}
@@ -660,14 +673,19 @@ nft_match_select_ops(const struct nft_ctx *ctx,
if (IS_ERR(match))
return ERR_PTR(-ENOENT);
- if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO]))
- return ERR_PTR(-EINVAL);
+ if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) {
+ err = -EINVAL;
+ goto err;
+ }
/* This is the first time we use this match, allocate operations */
nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
- if (nft_match == NULL)
- return ERR_PTR(-ENOMEM);
+ if (nft_match == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+ nft_match->refcnt = 1;
nft_match->ops.type = &nft_match_type;
nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
nft_match->ops.eval = nft_match_eval;
@@ -680,14 +698,9 @@ nft_match_select_ops(const struct nft_ctx *ctx,
list_add(&nft_match->head, &nft_match_list);
return &nft_match->ops;
-}
-
-static void nft_match_release(void)
-{
- struct nft_xt *nft_match, *tmp;
-
- list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head)
- kfree(nft_match);
+err:
+ module_put(match->me);
+ return ERR_PTR(err);
}
static struct nft_expr_type nft_match_type __read_mostly = {
@@ -717,6 +730,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
struct xt_target *target;
char *tg_name;
u32 rev, family;
+ int err;
if (tb[NFTA_TARGET_NAME] == NULL ||
tb[NFTA_TARGET_REV] == NULL ||
@@ -735,6 +749,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
if (!try_module_get(target->me))
return ERR_PTR(-ENOENT);
+ nft_target->refcnt++;
return &nft_target->ops;
}
}
@@ -743,14 +758,19 @@ nft_target_select_ops(const struct nft_ctx *ctx,
if (IS_ERR(target))
return ERR_PTR(-ENOENT);
- if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO]))
- return ERR_PTR(-EINVAL);
+ if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) {
+ err = -EINVAL;
+ goto err;
+ }
/* This is the first time we use this target, allocate operations */
nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
- if (nft_target == NULL)
- return ERR_PTR(-ENOMEM);
+ if (nft_target == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+ nft_target->refcnt = 1;
nft_target->ops.type = &nft_target_type;
nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
nft_target->ops.init = nft_target_init;
@@ -767,14 +787,9 @@ nft_target_select_ops(const struct nft_ctx *ctx,
list_add(&nft_target->head, &nft_target_list);
return &nft_target->ops;
-}
-
-static void nft_target_release(void)
-{
- struct nft_xt *nft_target, *tmp;
-
- list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head)
- kfree(nft_target);
+err:
+ module_put(target->me);
+ return ERR_PTR(err);
}
static struct nft_expr_type nft_target_type __read_mostly = {
@@ -819,8 +834,6 @@ static void __exit nft_compat_module_exit(void)
nfnetlink_subsys_unregister(&nfnl_compat_subsys);
nft_unregister_expr(&nft_target_type);
nft_unregister_expr(&nft_match_type);
- nft_match_release();
- nft_target_release();
}
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 81fbb4507..51e180f2a 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -109,18 +109,11 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
#ifdef CONFIG_NF_CONNTRACK_LABELS
case NFT_CT_LABELS: {
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
- unsigned int size;
- if (!labels) {
+ if (labels)
+ memcpy(dest, labels->bits, NF_CT_LABELS_MAX_SIZE);
+ else
memset(dest, 0, NF_CT_LABELS_MAX_SIZE);
- return;
- }
-
- size = labels->words * sizeof(long);
- memcpy(dest, labels->bits, size);
- if (size < NF_CT_LABELS_MAX_SIZE)
- memset(((char *) dest) + size, 0,
- NF_CT_LABELS_MAX_SIZE - size);
return;
}
#endif
@@ -351,6 +344,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
+ if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS)
+ nf_ct_set_acct(ctx->net, true);
+
return 0;
}
@@ -359,6 +355,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_ct *priv = nft_expr_priv(expr);
+ bool label_got = false;
unsigned int len;
int err;
@@ -377,6 +374,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
if (err)
return err;
+ label_got = true;
break;
#endif
default:
@@ -386,17 +384,28 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
err = nft_validate_register_load(priv->sreg, len);
if (err < 0)
- return err;
+ goto err1;
err = nft_ct_l3proto_try_module_get(ctx->afi->family);
if (err < 0)
- return err;
+ goto err1;
return 0;
+
+err1:
+ if (label_got)
+ nf_connlabels_put(ctx->net);
+ return err;
+}
+
+static void nft_ct_get_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ nft_ct_l3proto_module_put(ctx->afi->family);
}
-static void nft_ct_destroy(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_ct_set_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
struct nft_ct *priv = nft_expr_priv(expr);
@@ -468,7 +477,7 @@ static const struct nft_expr_ops nft_ct_get_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
.eval = nft_ct_get_eval,
.init = nft_ct_get_init,
- .destroy = nft_ct_destroy,
+ .destroy = nft_ct_get_destroy,
.dump = nft_ct_get_dump,
};
@@ -477,7 +486,7 @@ static const struct nft_expr_ops nft_ct_set_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
.eval = nft_ct_set_eval,
.init = nft_ct_set_init,
- .destroy = nft_ct_destroy,
+ .destroy = nft_ct_set_destroy,
.dump = nft_ct_set_dump,
};
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 78d4914fb..0af26699b 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -103,6 +103,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_dynset *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
struct nft_set *set;
u64 timeout;
int err;
@@ -112,11 +113,13 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
tb[NFTA_DYNSET_SREG_KEY] == NULL)
return -EINVAL;
- set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME]);
+ set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME],
+ genmask);
if (IS_ERR(set)) {
if (tb[NFTA_DYNSET_SET_ID])
set = nf_tables_set_lookup_byid(ctx->net,
- tb[NFTA_DYNSET_SET_ID]);
+ tb[NFTA_DYNSET_SET_ID],
+ genmask);
if (IS_ERR(set))
return PTR_ERR(set);
}
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index ba7aed13e..82c264e40 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -59,6 +59,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_exthdr *priv = nft_expr_priv(expr);
+ u32 offset, len;
if (tb[NFTA_EXTHDR_DREG] == NULL ||
tb[NFTA_EXTHDR_TYPE] == NULL ||
@@ -66,9 +67,15 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
tb[NFTA_EXTHDR_LEN] == NULL)
return -EINVAL;
+ offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
+ len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+
+ if (offset > U8_MAX || len > U8_MAX)
+ return -ERANGE;
+
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
- priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
- priv->len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+ priv->offset = offset;
+ priv->len = len;
priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
return nft_validate_register_store(ctx, priv->dreg, NULL,
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index f39c53a15..564fa7929 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -71,13 +71,13 @@ static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg,
return 0;
}
-static bool nft_hash_lookup(const struct nft_set *set, const u32 *key,
- const struct nft_set_ext **ext)
+static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
{
struct nft_hash *priv = nft_set_priv(set);
const struct nft_hash_elem *he;
struct nft_hash_cmp_arg arg = {
- .genmask = nft_genmask_cur(read_pnet(&set->pnet)),
+ .genmask = nft_genmask_cur(net),
.set = set,
.key = key,
};
@@ -125,13 +125,13 @@ err1:
return false;
}
-static int nft_hash_insert(const struct nft_set *set,
+static int nft_hash_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_elem *he = elem->priv;
struct nft_hash_cmp_arg arg = {
- .genmask = nft_genmask_next(read_pnet(&set->pnet)),
+ .genmask = nft_genmask_next(net),
.set = set,
.key = elem->key.val.data,
};
@@ -140,22 +140,23 @@ static int nft_hash_insert(const struct nft_set *set,
nft_hash_params);
}
-static void nft_hash_activate(const struct nft_set *set,
+static void nft_hash_activate(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct nft_hash_elem *he = elem->priv;
- nft_set_elem_change_active(set, &he->ext);
+ nft_set_elem_change_active(net, set, &he->ext);
nft_set_elem_clear_busy(&he->ext);
}
-static void *nft_hash_deactivate(const struct nft_set *set,
+static void *nft_hash_deactivate(const struct net *net,
+ const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_elem *he;
struct nft_hash_cmp_arg arg = {
- .genmask = nft_genmask_next(read_pnet(&set->pnet)),
+ .genmask = nft_genmask_next(net),
.set = set,
.key = elem->key.val.data,
};
@@ -163,8 +164,9 @@ static void *nft_hash_deactivate(const struct nft_set *set,
rcu_read_lock();
he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
if (he != NULL) {
- if (!nft_set_elem_mark_busy(&he->ext))
- nft_set_elem_change_active(set, &he->ext);
+ if (!nft_set_elem_mark_busy(&he->ext) ||
+ !nft_is_active(net, &he->ext))
+ nft_set_elem_change_active(net, set, &he->ext);
else
he = NULL;
}
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 319c22b4b..24a73bb26 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -52,7 +52,14 @@ static int nft_log_init(const struct nft_ctx *ctx,
struct nft_log *priv = nft_expr_priv(expr);
struct nf_loginfo *li = &priv->loginfo;
const struct nlattr *nla;
- int ret;
+ int err;
+
+ li->type = NF_LOG_TYPE_LOG;
+ if (tb[NFTA_LOG_LEVEL] != NULL &&
+ tb[NFTA_LOG_GROUP] != NULL)
+ return -EINVAL;
+ if (tb[NFTA_LOG_GROUP] != NULL)
+ li->type = NF_LOG_TYPE_ULOG;
nla = tb[NFTA_LOG_PREFIX];
if (nla != NULL) {
@@ -64,13 +71,6 @@ static int nft_log_init(const struct nft_ctx *ctx,
priv->prefix = (char *)nft_log_null_prefix;
}
- li->type = NF_LOG_TYPE_LOG;
- if (tb[NFTA_LOG_LEVEL] != NULL &&
- tb[NFTA_LOG_GROUP] != NULL)
- return -EINVAL;
- if (tb[NFTA_LOG_GROUP] != NULL)
- li->type = NF_LOG_TYPE_ULOG;
-
switch (li->type) {
case NF_LOG_TYPE_LOG:
if (tb[NFTA_LOG_LEVEL] != NULL) {
@@ -79,6 +79,11 @@ static int nft_log_init(const struct nft_ctx *ctx,
} else {
li->u.log.level = LOGLEVEL_WARNING;
}
+ if (li->u.log.level > LOGLEVEL_DEBUG) {
+ err = -EINVAL;
+ goto err1;
+ }
+
if (tb[NFTA_LOG_FLAGS] != NULL) {
li->u.log.logflags =
ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS]));
@@ -87,6 +92,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
case NF_LOG_TYPE_ULOG:
li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP]));
if (tb[NFTA_LOG_SNAPLEN] != NULL) {
+ li->u.ulog.flags |= NF_LOG_F_COPY_LEN;
li->u.ulog.copy_len =
ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN]));
}
@@ -97,20 +103,16 @@ static int nft_log_init(const struct nft_ctx *ctx,
break;
}
- if (ctx->afi->family == NFPROTO_INET) {
- ret = nf_logger_find_get(NFPROTO_IPV4, li->type);
- if (ret < 0)
- return ret;
+ err = nf_logger_find_get(ctx->afi->family, li->type);
+ if (err < 0)
+ goto err1;
- ret = nf_logger_find_get(NFPROTO_IPV6, li->type);
- if (ret < 0) {
- nf_logger_put(NFPROTO_IPV4, li->type);
- return ret;
- }
- return 0;
- }
+ return 0;
- return nf_logger_find_get(ctx->afi->family, li->type);
+err1:
+ if (priv->prefix != nft_log_null_prefix)
+ kfree(priv->prefix);
+ return err;
}
static void nft_log_destroy(const struct nft_ctx *ctx,
@@ -122,12 +124,7 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
if (priv->prefix != nft_log_null_prefix)
kfree(priv->prefix);
- if (ctx->afi->family == NFPROTO_INET) {
- nf_logger_put(NFPROTO_IPV4, li->type);
- nf_logger_put(NFPROTO_IPV6, li->type);
- } else {
- nf_logger_put(ctx->afi->family, li->type);
- }
+ nf_logger_put(ctx->afi->family, li->type);
}
static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -153,7 +150,7 @@ static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group)))
goto nla_put_failure;
- if (li->u.ulog.copy_len) {
+ if (li->u.ulog.flags & NF_LOG_F_COPY_LEN) {
if (nla_put_be32(skb, NFTA_LOG_SNAPLEN,
htonl(li->u.ulog.copy_len)))
goto nla_put_failure;
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index b3c31ef80..e164325d1 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -22,6 +22,7 @@ struct nft_lookup {
struct nft_set *set;
enum nft_registers sreg:8;
enum nft_registers dreg:8;
+ bool invert;
struct nft_set_binding binding;
};
@@ -32,14 +33,20 @@ static void nft_lookup_eval(const struct nft_expr *expr,
const struct nft_lookup *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
const struct nft_set_ext *ext;
+ bool found;
- if (set->ops->lookup(set, &regs->data[priv->sreg], &ext)) {
- if (set->flags & NFT_SET_MAP)
- nft_data_copy(&regs->data[priv->dreg],
- nft_set_ext_data(ext), set->dlen);
+ found = set->ops->lookup(pkt->net, set, &regs->data[priv->sreg], &ext) ^
+ priv->invert;
+
+ if (!found) {
+ regs->verdict.code = NFT_BREAK;
return;
}
- regs->verdict.code = NFT_BREAK;
+
+ if (found && set->flags & NFT_SET_MAP)
+ nft_data_copy(&regs->data[priv->dreg],
+ nft_set_ext_data(ext), set->dlen);
+
}
static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
@@ -47,6 +54,7 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
[NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 },
[NFTA_LOOKUP_SREG] = { .type = NLA_U32 },
[NFTA_LOOKUP_DREG] = { .type = NLA_U32 },
+ [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 },
};
static int nft_lookup_init(const struct nft_ctx *ctx,
@@ -54,18 +62,21 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_lookup *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
struct nft_set *set;
+ u32 flags;
int err;
if (tb[NFTA_LOOKUP_SET] == NULL ||
tb[NFTA_LOOKUP_SREG] == NULL)
return -EINVAL;
- set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]);
+ set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET], genmask);
if (IS_ERR(set)) {
if (tb[NFTA_LOOKUP_SET_ID]) {
set = nf_tables_set_lookup_byid(ctx->net,
- tb[NFTA_LOOKUP_SET_ID]);
+ tb[NFTA_LOOKUP_SET_ID],
+ genmask);
}
if (IS_ERR(set))
return PTR_ERR(set);
@@ -79,7 +90,22 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
+ if (tb[NFTA_LOOKUP_FLAGS]) {
+ flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS]));
+
+ if (flags & ~NFT_LOOKUP_F_INV)
+ return -EINVAL;
+
+ if (flags & NFT_LOOKUP_F_INV) {
+ if (set->flags & NFT_SET_MAP)
+ return -EINVAL;
+ priv->invert = true;
+ }
+ }
+
if (tb[NFTA_LOOKUP_DREG] != NULL) {
+ if (priv->invert)
+ return -EINVAL;
if (!(set->flags & NFT_SET_MAP))
return -EINVAL;
@@ -112,6 +138,7 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx,
static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
+ u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0;
if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name))
goto nla_put_failure;
@@ -120,6 +147,8 @@ static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (priv->set->flags & NFT_SET_MAP)
if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg))
goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index f4bad9dc1..8a6bc7630 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -199,13 +199,6 @@ err:
}
EXPORT_SYMBOL_GPL(nft_meta_get_eval);
-/* don't change or set _LOOPBACK, _USER, etc. */
-static bool pkt_type_ok(u32 p)
-{
- return p == PACKET_HOST || p == PACKET_BROADCAST ||
- p == PACKET_MULTICAST || p == PACKET_OTHERHOST;
-}
-
void nft_meta_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -223,7 +216,7 @@ void nft_meta_set_eval(const struct nft_expr *expr,
break;
case NFT_META_PKTTYPE:
if (skb->pkt_type != value &&
- pkt_type_ok(value) && pkt_type_ok(skb->pkt_type))
+ skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type))
skb->pkt_type = value;
break;
case NFT_META_NFTRACE:
@@ -298,10 +291,16 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
}
EXPORT_SYMBOL_GPL(nft_meta_get_init);
-static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx)
+int nft_meta_set_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
{
+ struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
+ if (priv->key != NFT_META_PKTTYPE)
+ return 0;
+
switch (ctx->afi->family) {
case NFPROTO_BRIDGE:
hooks = 1 << NF_BR_PRE_ROUTING;
@@ -315,6 +314,7 @@ static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx)
return nft_chain_validate_hooks(ctx->chain, hooks);
}
+EXPORT_SYMBOL_GPL(nft_meta_set_validate);
int nft_meta_set_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
@@ -334,15 +334,16 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
len = sizeof(u8);
break;
case NFT_META_PKTTYPE:
- err = nft_meta_set_init_pkttype(ctx);
- if (err)
- return err;
len = sizeof(u8);
break;
default:
return -EOPNOTSUPP;
}
+ err = nft_meta_set_validate(ctx, expr, NULL);
+ if (err < 0)
+ return err;
+
priv->sreg = nft_parse_register(tb[NFTA_META_SREG]);
err = nft_validate_register_load(priv->sreg, len);
if (err < 0)
@@ -414,6 +415,7 @@ static const struct nft_expr_ops nft_meta_set_ops = {
.init = nft_meta_set_init,
.destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
+ .validate = nft_meta_set_validate,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
index 7201d57b5..ffe9ae062 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_rbtree.c
@@ -41,13 +41,13 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
}
-static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
- const struct nft_set_ext **ext)
+static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
{
const struct nft_rbtree *priv = nft_set_priv(set);
const struct nft_rbtree_elem *rbe, *interval = NULL;
+ u8 genmask = nft_genmask_cur(net);
const struct rb_node *parent;
- u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
const void *this;
int d;
@@ -70,7 +70,6 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
} else if (d > 0)
parent = parent->rb_right;
else {
-found:
if (!nft_set_elem_active(&rbe->ext, genmask)) {
parent = parent->rb_left;
continue;
@@ -84,22 +83,25 @@ found:
}
}
- if (set->flags & NFT_SET_INTERVAL && interval != NULL) {
- rbe = interval;
- goto found;
+ if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
+ nft_set_elem_active(&interval->ext, genmask) &&
+ !nft_rbtree_interval_end(interval)) {
+ spin_unlock_bh(&nft_rbtree_lock);
+ *ext = &interval->ext;
+ return true;
}
out:
spin_unlock_bh(&nft_rbtree_lock);
return false;
}
-static int __nft_rbtree_insert(const struct nft_set *set,
+static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree_elem *new)
{
struct nft_rbtree *priv = nft_set_priv(set);
+ u8 genmask = nft_genmask_next(net);
struct nft_rbtree_elem *rbe;
struct rb_node *parent, **p;
- u8 genmask = nft_genmask_next(read_pnet(&set->pnet));
int d;
parent = NULL;
@@ -132,14 +134,14 @@ static int __nft_rbtree_insert(const struct nft_set *set,
return 0;
}
-static int nft_rbtree_insert(const struct nft_set *set,
+static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct nft_rbtree_elem *rbe = elem->priv;
int err;
spin_lock_bh(&nft_rbtree_lock);
- err = __nft_rbtree_insert(set, rbe);
+ err = __nft_rbtree_insert(net, set, rbe);
spin_unlock_bh(&nft_rbtree_lock);
return err;
@@ -156,21 +158,23 @@ static void nft_rbtree_remove(const struct nft_set *set,
spin_unlock_bh(&nft_rbtree_lock);
}
-static void nft_rbtree_activate(const struct nft_set *set,
+static void nft_rbtree_activate(const struct net *net,
+ const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct nft_rbtree_elem *rbe = elem->priv;
- nft_set_elem_change_active(set, &rbe->ext);
+ nft_set_elem_change_active(net, set, &rbe->ext);
}
-static void *nft_rbtree_deactivate(const struct nft_set *set,
+static void *nft_rbtree_deactivate(const struct net *net,
+ const struct nft_set *set,
const struct nft_set_elem *elem)
{
const struct nft_rbtree *priv = nft_set_priv(set);
const struct rb_node *parent = priv->root.rb_node;
struct nft_rbtree_elem *rbe, *this = elem->priv;
- u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
+ u8 genmask = nft_genmask_next(net);
int d;
while (parent != NULL) {
@@ -196,7 +200,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
parent = parent->rb_right;
continue;
}
- nft_set_elem_change_active(set, &rbe->ext);
+ nft_set_elem_change_active(net, set, &rbe->ext);
return rbe;
}
}
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 0522fc9bf..c64de3f73 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -26,11 +26,27 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
};
EXPORT_SYMBOL_GPL(nft_reject_policy);
+int nft_reject_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT));
+}
+EXPORT_SYMBOL_GPL(nft_reject_validate);
+
int nft_reject_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_reject *priv = nft_expr_priv(expr);
+ int err;
+
+ err = nft_reject_validate(ctx, expr, NULL);
+ if (err < 0)
+ return err;
if (tb[NFTA_REJECT_TYPE] == NULL)
return -EINVAL;
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 759ca5248..e79d9ca2f 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -66,7 +66,11 @@ static int nft_reject_inet_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_reject *priv = nft_expr_priv(expr);
- int icmp_code;
+ int icmp_code, err;
+
+ err = nft_reject_validate(ctx, expr, NULL);
+ if (err < 0)
+ return err;
if (tb[NFTA_REJECT_TYPE] == NULL)
return -EINVAL;
@@ -124,6 +128,7 @@ static const struct nft_expr_ops nft_reject_inet_ops = {
.eval = nft_reject_inet_eval,
.init = nft_reject_inet_init,
.dump = nft_reject_inet_dump,
+ .validate = nft_reject_validate,
};
static struct nft_expr_type nft_reject_inet_type __read_mostly = {
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 2675d580c..e0aa7c1d0 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -702,6 +702,56 @@ int xt_check_entry_offsets(const void *base,
}
EXPORT_SYMBOL(xt_check_entry_offsets);
+/**
+ * xt_alloc_entry_offsets - allocate array to store rule head offsets
+ *
+ * @size: number of entries
+ *
+ * Return: NULL or kmalloc'd or vmalloc'd array
+ */
+unsigned int *xt_alloc_entry_offsets(unsigned int size)
+{
+ unsigned int *off;
+
+ off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN);
+
+ if (off)
+ return off;
+
+ if (size < (SIZE_MAX / sizeof(unsigned int)))
+ off = vmalloc(size * sizeof(unsigned int));
+
+ return off;
+}
+EXPORT_SYMBOL(xt_alloc_entry_offsets);
+
+/**
+ * xt_find_jump_offset - check if target is a valid jump offset
+ *
+ * @offsets: array containing all valid rule start offsets of a rule blob
+ * @target: the jump target to search for
+ * @size: entries in @offset
+ */
+bool xt_find_jump_offset(const unsigned int *offsets,
+ unsigned int target, unsigned int size)
+{
+ int m, low = 0, hi = size;
+
+ while (hi > low) {
+ m = (low + hi) / 2u;
+
+ if (offsets[m] > target)
+ hi = m;
+ else if (offsets[m] < target)
+ low = m + 1;
+ else
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(xt_find_jump_offset);
+
int xt_check_target(struct xt_tgchk_param *par,
unsigned int size, u_int8_t proto, bool inv_proto)
{
@@ -1460,6 +1510,9 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
uint8_t hooknum;
struct nf_hook_ops *ops;
+ if (!num_hooks)
+ return ERR_PTR(-EINVAL);
+
ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL);
if (ops == NULL)
return ERR_PTR(-ENOMEM);
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index a1fa2c800..018eed7e1 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -33,6 +33,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
li.u.ulog.group = info->group;
li.u.ulog.qthreshold = info->threshold;
+ if (info->flags & XT_NFLOG_F_COPY_LEN)
+ li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
+
nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
par->out, &li, info->prefix);
return XT_CONTINUE;
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 604df6fae..515131f9e 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -137,7 +137,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
cfg.est.ewma_log = info->ewma_log;
ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
- &est->lock, &cfg.opt);
+ &est->lock, NULL, &cfg.opt);
if (ret < 0)
goto err2;
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 7f4414d26..663c4c3c9 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
daddr, dport,
in->ifindex);
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
/* NOTE: we return listeners even if bound to
* 0.0.0.0, those are filtered out in
* xt_socket, since xt_TPROXY needs 0 bound
@@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
daddr, ntohs(dport),
in->ifindex);
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
/* NOTE: we return listeners even if bound to
* 0.0.0.0, those are filtered out in
* xt_socket, since xt_TPROXY needs 0 bound
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index df48967af..858d189a1 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -4,12 +4,23 @@
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
MODULE_DESCRIPTION("Xtables: packet flow tracing");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_TRACE");
MODULE_ALIAS("ip6t_TRACE");
+static int trace_tg_check(const struct xt_tgchk_param *par)
+{
+ return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+}
+
+static void trace_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_logger_put(par->family, NF_LOG_TYPE_LOG);
+}
+
static unsigned int
trace_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -18,12 +29,14 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par)
}
static struct xt_target trace_tg_reg __read_mostly = {
- .name = "TRACE",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .table = "raw",
- .target = trace_tg,
- .me = THIS_MODULE,
+ .name = "TRACE",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .table = "raw",
+ .target = trace_tg,
+ .checkentry = trace_tg_check,
+ .destroy = trace_tg_destroy,
+ .me = THIS_MODULE,
};
static int __init trace_tg_init(void)
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index a79af2555..03d66f1c5 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <linux/netfilter/x_tables.h>
@@ -18,21 +19,12 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels");
MODULE_ALIAS("ipt_connlabel");
MODULE_ALIAS("ip6t_connlabel");
-static bool connlabel_match(const struct nf_conn *ct, u16 bit)
-{
- struct nf_conn_labels *labels = nf_ct_labels_find(ct);
-
- if (!labels)
- return false;
-
- return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits);
-}
-
static bool
connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_connlabel_mtinfo *info = par->matchinfo;
enum ip_conntrack_info ctinfo;
+ struct nf_conn_labels *labels;
struct nf_conn *ct;
bool invert = info->options & XT_CONNLABEL_OP_INVERT;
@@ -40,10 +32,21 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (ct == NULL || nf_ct_is_untracked(ct))
return invert;
- if (info->options & XT_CONNLABEL_OP_SET)
- return (nf_connlabel_set(ct, info->bit) == 0) ^ invert;
+ labels = nf_ct_labels_find(ct);
+ if (!labels)
+ return invert;
+
+ if (test_bit(info->bit, labels->bits))
+ return !invert;
+
+ if (info->options & XT_CONNLABEL_OP_SET) {
+ if (!test_and_set_bit(info->bit, labels->bits))
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+
+ return !invert;
+ }
- return connlabel_match(ct, info->bit) ^ invert;
+ return invert;
}
static int connlabel_mt_check(const struct xt_mtchk_param *par)
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 3048a7e3a..cf3275938 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -26,7 +26,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
nfnl_acct_update(skb, info->nfacct);
- overquota = nfnl_acct_overquota(skb, info->nfacct);
+ overquota = nfnl_acct_overquota(par->net, skb, info->nfacct);
return overquota == NFACCT_UNDERQUOTA ? false : true;
}
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 1302b475a..a20e731b5 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -21,11 +21,39 @@
static int owner_check(const struct xt_mtchk_param *par)
{
struct xt_owner_match_info *info = par->matchinfo;
+ struct net *net = par->net;
- /* For now only allow adding matches from the initial user namespace */
+ /* Only allow the common case where the userns of the writer
+ * matches the userns of the network namespace.
+ */
if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) &&
- (current_user_ns() != &init_user_ns))
+ (current_user_ns() != net->user_ns))
return -EINVAL;
+
+ /* Ensure the uids are valid */
+ if (info->match & XT_OWNER_UID) {
+ kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
+ kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
+
+ if (!uid_valid(uid_min) || !uid_valid(uid_max) ||
+ (info->uid_max < info->uid_min) ||
+ uid_lt(uid_max, uid_min)) {
+ return -EINVAL;
+ }
+ }
+
+ /* Ensure the gids are valid */
+ if (info->match & XT_OWNER_GID) {
+ kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
+ kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
+
+ if (!gid_valid(gid_min) || !gid_valid(gid_max) ||
+ (info->gid_max < info->gid_min) ||
+ gid_lt(gid_max, gid_min)) {
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -35,6 +63,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_owner_match_info *info = par->matchinfo;
const struct file *filp;
struct sock *sk = skb_to_full_sk(skb);
+ struct net *net = par->net;
if (sk == NULL || sk->sk_socket == NULL)
return (info->match ^ info->invert) == 0;
@@ -51,8 +80,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
(XT_OWNER_UID | XT_OWNER_GID)) == 0;
if (info->match & XT_OWNER_UID) {
- kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min);
- kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max);
+ kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
+ kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
uid_lte(filp->f_cred->fsuid, uid_max)) ^
!(info->invert & XT_OWNER_UID))
@@ -60,8 +89,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
if (info->match & XT_OWNER_GID) {
- kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min);
- kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max);
+ kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
+ kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
gid_lte(filp->f_cred->fsgid, gid_max)) ^
!(info->invert & XT_OWNER_GID))
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 1caaccbc3..e5f18988a 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -102,14 +102,14 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
info->bitmask & ~XT_PHYSDEV_OP_MASK)
return -EINVAL;
- if (info->bitmask & XT_PHYSDEV_OP_OUT &&
+ if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) &&
(!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
info->invert & XT_PHYSDEV_OP_BRIDGED) &&
par->hook_mask & ((1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) {
- pr_info("using --physdev-out in the OUTPUT, FORWARD and "
- "POSTROUTING chains for non-bridged traffic is not "
- "supported anymore.\n");
+ pr_info("using --physdev-out and --physdev-is-out are only"
+ "supported in the FORWARD and POSTROUTING chains with"
+ "bridged traffic.\n");
if (par->hook_mask & (1 << NF_INET_LOCAL_OUT))
return -EINVAL;
}
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index c14d4645d..ade024c90 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -83,8 +83,6 @@ static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par)
return false;
}
-#define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg)))
-
th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
if (th == NULL) {
/* We've been asked to examine this packet, and we
@@ -102,9 +100,8 @@ static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par)
ntohs(th->dest),
!!(tcpinfo->invflags & XT_TCP_INV_DSTPT)))
return false;
- if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
- == tcpinfo->flg_cmp,
- XT_TCP_INV_FLAGS))
+ if (!NF_INVF(tcpinfo, XT_TCP_INV_FLAGS,
+ (((unsigned char *)th)[13] & tcpinfo->flg_mask) == tcpinfo->flg_cmp))
return false;
if (tcpinfo->option) {
if (th->doff * 4 < sizeof(_tcph)) {
diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig
index 56958c85f..d9eaa30ff 100644
--- a/net/netlabel/Kconfig
+++ b/net/netlabel/Kconfig
@@ -5,6 +5,7 @@
config NETLABEL
bool "NetLabel subsystem support"
depends on SECURITY
+ select CRC_CCITT if IPV6
default n
---help---
NetLabel provides support for explicit network packet labeling
diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile
index d2732fc95..d341ede0d 100644
--- a/net/netlabel/Makefile
+++ b/net/netlabel/Makefile
@@ -12,4 +12,4 @@ obj-y += netlabel_mgmt.o
# protocol modules
obj-y += netlabel_unlabeled.o
obj-y += netlabel_cipso_v4.o
-
+obj-$(subst m,y,$(CONFIG_IPV6)) += netlabel_calipso.o
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
new file mode 100644
index 000000000..2ec93c5e7
--- /dev/null
+++ b/net/netlabel/netlabel_calipso.c
@@ -0,0 +1,740 @@
+/*
+ * NetLabel CALIPSO/IPv6 Support
+ *
+ * This file defines the CALIPSO/IPv6 functions for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and CALIPSO.
+ *
+ * Authors: Paul Moore <paul@paul-moore.com>
+ * Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/netlabel.h>
+#include <net/calipso.h>
+#include <linux/atomic.h>
+
+#include "netlabel_user.h"
+#include "netlabel_calipso.h"
+#include "netlabel_mgmt.h"
+#include "netlabel_domainhash.h"
+
+/* Argument struct for calipso_doi_walk() */
+struct netlbl_calipso_doiwalk_arg {
+ struct netlink_callback *nl_cb;
+ struct sk_buff *skb;
+ u32 seq;
+};
+
+/* Argument struct for netlbl_domhsh_walk() */
+struct netlbl_domhsh_walk_arg {
+ struct netlbl_audit *audit_info;
+ u32 doi;
+};
+
+/* NetLabel Generic NETLINK CALIPSO family */
+static struct genl_family netlbl_calipso_gnl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_CALIPSO_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_CALIPSO_A_MAX,
+};
+
+/* NetLabel Netlink attribute policy */
+static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = {
+ [NLBL_CALIPSO_A_DOI] = { .type = NLA_U32 },
+ [NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 },
+};
+
+/* NetLabel Command Handlers
+ */
+/**
+ * netlbl_calipso_add_pass - Adds a CALIPSO pass DOI definition
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Create a new CALIPSO_MAP_PASS DOI definition based on the given ADD message
+ * and add it to the CALIPSO engine. Return zero on success and non-zero on
+ * error.
+ *
+ */
+static int netlbl_calipso_add_pass(struct genl_info *info,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct calipso_doi *doi_def = NULL;
+
+ doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
+ if (!doi_def)
+ return -ENOMEM;
+ doi_def->type = CALIPSO_MAP_PASS;
+ doi_def->doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]);
+ ret_val = calipso_doi_add(doi_def, audit_info);
+ if (ret_val != 0)
+ calipso_doi_free(doi_def);
+
+ return ret_val;
+}
+
+/**
+ * netlbl_calipso_add - Handle an ADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Create a new DOI definition based on the given ADD message and add it to the
+ * CALIPSO engine. Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info)
+
+{
+ int ret_val = -EINVAL;
+ struct netlbl_audit audit_info;
+
+ if (!info->attrs[NLBL_CALIPSO_A_DOI] ||
+ !info->attrs[NLBL_CALIPSO_A_MTYPE])
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+ switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) {
+ case CALIPSO_MAP_PASS:
+ ret_val = netlbl_calipso_add_pass(info, &audit_info);
+ break;
+ }
+ if (ret_val == 0)
+ atomic_inc(&netlabel_mgmt_protocount);
+
+ return ret_val;
+}
+
+/**
+ * netlbl_calipso_list - Handle a LIST message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated LIST message and respond accordingly.
+ * Returns zero on success and negative values on error.
+ *
+ */
+static int netlbl_calipso_list(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret_val;
+ struct sk_buff *ans_skb = NULL;
+ void *data;
+ u32 doi;
+ struct calipso_doi *doi_def;
+
+ if (!info->attrs[NLBL_CALIPSO_A_DOI]) {
+ ret_val = -EINVAL;
+ goto list_failure;
+ }
+
+ doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]);
+
+ doi_def = calipso_doi_getdef(doi);
+ if (!doi_def) {
+ ret_val = -EINVAL;
+ goto list_failure;
+ }
+
+ ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ans_skb) {
+ ret_val = -ENOMEM;
+ goto list_failure_put;
+ }
+ data = genlmsg_put_reply(ans_skb, info, &netlbl_calipso_gnl_family,
+ 0, NLBL_CALIPSO_C_LIST);
+ if (!data) {
+ ret_val = -ENOMEM;
+ goto list_failure_put;
+ }
+
+ ret_val = nla_put_u32(ans_skb, NLBL_CALIPSO_A_MTYPE, doi_def->type);
+ if (ret_val != 0)
+ goto list_failure_put;
+
+ calipso_doi_putdef(doi_def);
+
+ genlmsg_end(ans_skb, data);
+ return genlmsg_reply(ans_skb, info);
+
+list_failure_put:
+ calipso_doi_putdef(doi_def);
+list_failure:
+ kfree_skb(ans_skb);
+ return ret_val;
+}
+
+/**
+ * netlbl_calipso_listall_cb - calipso_doi_walk() callback for LISTALL
+ * @doi_def: the CALIPSO DOI definition
+ * @arg: the netlbl_calipso_doiwalk_arg structure
+ *
+ * Description:
+ * This function is designed to be used as a callback to the
+ * calipso_doi_walk() function for use in generating a response for a LISTALL
+ * message. Returns the size of the message on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_calipso_listall_cb(struct calipso_doi *doi_def, void *arg)
+{
+ int ret_val = -ENOMEM;
+ struct netlbl_calipso_doiwalk_arg *cb_arg = arg;
+ void *data;
+
+ data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid,
+ cb_arg->seq, &netlbl_calipso_gnl_family,
+ NLM_F_MULTI, NLBL_CALIPSO_C_LISTALL);
+ if (!data)
+ goto listall_cb_failure;
+
+ ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_DOI, doi_def->doi);
+ if (ret_val != 0)
+ goto listall_cb_failure;
+ ret_val = nla_put_u32(cb_arg->skb,
+ NLBL_CALIPSO_A_MTYPE,
+ doi_def->type);
+ if (ret_val != 0)
+ goto listall_cb_failure;
+
+ genlmsg_end(cb_arg->skb, data);
+ return 0;
+
+listall_cb_failure:
+ genlmsg_cancel(cb_arg->skb, data);
+ return ret_val;
+}
+
+/**
+ * netlbl_calipso_listall - Handle a LISTALL message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated LISTALL message and respond accordingly. Returns
+ * zero on success and negative values on error.
+ *
+ */
+static int netlbl_calipso_listall(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netlbl_calipso_doiwalk_arg cb_arg;
+ u32 doi_skip = cb->args[0];
+
+ cb_arg.nl_cb = cb;
+ cb_arg.skb = skb;
+ cb_arg.seq = cb->nlh->nlmsg_seq;
+
+ calipso_doi_walk(&doi_skip, netlbl_calipso_listall_cb, &cb_arg);
+
+ cb->args[0] = doi_skip;
+ return skb->len;
+}
+
+/**
+ * netlbl_calipso_remove_cb - netlbl_calipso_remove() callback for REMOVE
+ * @entry: LSM domain mapping entry
+ * @arg: the netlbl_domhsh_walk_arg structure
+ *
+ * Description:
+ * This function is intended for use by netlbl_calipso_remove() as the callback
+ * for the netlbl_domhsh_walk() function; it removes LSM domain map entries
+ * which are associated with the CALIPSO DOI specified in @arg. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int netlbl_calipso_remove_cb(struct netlbl_dom_map *entry, void *arg)
+{
+ struct netlbl_domhsh_walk_arg *cb_arg = arg;
+
+ if (entry->def.type == NETLBL_NLTYPE_CALIPSO &&
+ entry->def.calipso->doi == cb_arg->doi)
+ return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info);
+
+ return 0;
+}
+
+/**
+ * netlbl_calipso_remove - Handle a REMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated REMOVE message and respond accordingly. Returns
+ * zero on success, negative values on failure.
+ *
+ */
+static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret_val = -EINVAL;
+ struct netlbl_domhsh_walk_arg cb_arg;
+ struct netlbl_audit audit_info;
+ u32 skip_bkt = 0;
+ u32 skip_chain = 0;
+
+ if (!info->attrs[NLBL_CALIPSO_A_DOI])
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+ cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]);
+ cb_arg.audit_info = &audit_info;
+ ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain,
+ netlbl_calipso_remove_cb, &cb_arg);
+ if (ret_val == 0 || ret_val == -ENOENT) {
+ ret_val = calipso_doi_remove(cb_arg.doi, &audit_info);
+ if (ret_val == 0)
+ atomic_dec(&netlabel_mgmt_protocount);
+ }
+
+ return ret_val;
+}
+
+/* NetLabel Generic NETLINK Command Definitions
+ */
+
+static const struct genl_ops netlbl_calipso_ops[] = {
+ {
+ .cmd = NLBL_CALIPSO_C_ADD,
+ .flags = GENL_ADMIN_PERM,
+ .policy = calipso_genl_policy,
+ .doit = netlbl_calipso_add,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_CALIPSO_C_REMOVE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = calipso_genl_policy,
+ .doit = netlbl_calipso_remove,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_CALIPSO_C_LIST,
+ .flags = 0,
+ .policy = calipso_genl_policy,
+ .doit = netlbl_calipso_list,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_CALIPSO_C_LISTALL,
+ .flags = 0,
+ .policy = calipso_genl_policy,
+ .doit = NULL,
+ .dumpit = netlbl_calipso_listall,
+ },
+};
+
+/* NetLabel Generic NETLINK Protocol Functions
+ */
+
+/**
+ * netlbl_calipso_genl_init - Register the CALIPSO NetLabel component
+ *
+ * Description:
+ * Register the CALIPSO packet NetLabel component with the Generic NETLINK
+ * mechanism. Returns zero on success, negative values on failure.
+ *
+ */
+int __init netlbl_calipso_genl_init(void)
+{
+ return genl_register_family_with_ops(&netlbl_calipso_gnl_family,
+ netlbl_calipso_ops);
+}
+
+static const struct netlbl_calipso_ops *calipso_ops;
+
+/**
+ * netlbl_calipso_ops_register - Register the CALIPSO operations
+ *
+ * Description:
+ * Register the CALIPSO packet engine operations.
+ *
+ */
+const struct netlbl_calipso_ops *
+netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops)
+{
+ return xchg(&calipso_ops, ops);
+}
+EXPORT_SYMBOL(netlbl_calipso_ops_register);
+
+static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void)
+{
+ return ACCESS_ONCE(calipso_ops);
+}
+
+/**
+ * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CALIPSO engine and calls this
+ * function to add it to the list of acceptable domains. The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see calipso.h for details). Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+int calipso_doi_add(struct calipso_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->doi_add(doi_def, audit_info);
+ return ret_val;
+}
+
+/**
+ * calipso_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+void calipso_doi_free(struct calipso_doi *doi_def)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->doi_free(doi_def);
+}
+
+/**
+ * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_secid: the LSM secid to use in the audit message
+ *
+ * Description:
+ * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list. Returns zero on success and negative values on failure.
+ *
+ */
+int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->doi_remove(doi, audit_info);
+ return ret_val;
+}
+
+/**
+ * calipso_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller. Otherwise NULL is returned. The caller must ensure that
+ * calipso_doi_putdef() is called when the caller is done.
+ *
+ */
+struct calipso_doi *calipso_doi_getdef(u32 doi)
+{
+ struct calipso_doi *ret_val = NULL;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->doi_getdef(doi);
+ return ret_val;
+}
+
+/**
+ * calipso_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from calipso_doi_getdef().
+ *
+ */
+void calipso_doi_putdef(struct calipso_doi *doi_def)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->doi_putdef(doi_def);
+}
+
+/**
+ * calipso_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return. Updates the value in @skip_cnt upon
+ * return. Returns zero on success, negative values on failure.
+ *
+ */
+int calipso_doi_walk(u32 *skip_cnt,
+ int (*callback)(struct calipso_doi *doi_def, void *arg),
+ void *cb_arg)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->doi_walk(skip_cnt, callback, cb_arg);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CALIPSO option attached to the sock and if
+ * there is return the CALIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->sock_getattr(sk, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_setattr - Add a CALIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int calipso_sock_setattr(struct sock *sk,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->sock_setattr(sk, doi_def, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_delattr - Delete the CALIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a socket, if present.
+ *
+ */
+void calipso_sock_delattr(struct sock *sk)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->sock_delattr(sk);
+}
+
+/**
+ * calipso_req_setattr - Add a CALIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int calipso_req_setattr(struct request_sock *req,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->req_setattr(req, doi_def, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_req_delattr - Delete the CALIPSO option from a request socket
+ * @reg: the request socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a request socket, if present.
+ *
+ */
+void calipso_req_delattr(struct request_sock *req)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->req_delattr(req);
+}
+
+/**
+ * calipso_optptr - Find the CALIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer
+ * to the start of the CALIPSO option on success, NULL if one if not found.
+ *
+ */
+unsigned char *calipso_optptr(const struct sk_buff *skb)
+{
+ unsigned char *ret_val = NULL;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->skbuff_optptr(skb);
+ return ret_val;
+}
+
+/**
+ * calipso_getattr - Get the security attributes from a memory block.
+ * @calipso: the CALIPSO option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @calipso and return the security attributes in @secattr.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+int calipso_getattr(const unsigned char *calipso,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->opt_getattr(calipso, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_skbuff_setattr - Set the CALIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CALIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+int calipso_skbuff_setattr(struct sk_buff *skb,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->skbuff_setattr(skb, doi_def, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_skbuff_delattr - Delete any CALIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CALIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int calipso_skbuff_delattr(struct sk_buff *skb)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->skbuff_delattr(skb);
+ return ret_val;
+}
+
+/**
+ * calipso_cache_invalidate - Invalidates the current CALIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CALIPSO cache. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void calipso_cache_invalidate(void)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->cache_invalidate();
+}
+
+/**
+ * calipso_cache_add - Add an entry to the CALIPSO cache
+ * @calipso_ptr: the CALIPSO option
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CALIPSO label mapping cache.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+int calipso_cache_add(const unsigned char *calipso_ptr,
+ const struct netlbl_lsm_secattr *secattr)
+
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->cache_add(calipso_ptr, secattr);
+ return ret_val;
+}
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
new file mode 100644
index 000000000..9fd291cd0
--- /dev/null
+++ b/net/netlabel/netlabel_calipso.h
@@ -0,0 +1,151 @@
+/*
+ * NetLabel CALIPSO Support
+ *
+ * This file defines the CALIPSO functions for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Authors: Paul Moore <paul@paul-moore.com>
+ * Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _NETLABEL_CALIPSO
+#define _NETLABEL_CALIPSO
+
+#include <net/netlabel.h>
+#include <net/calipso.h>
+
+/* The following NetLabel payloads are supported by the CALIPSO subsystem.
+ *
+ * o ADD:
+ * Sent by an application to add a new DOI mapping table.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_DOI
+ * NLBL_CALIPSO_A_MTYPE
+ *
+ * If using CALIPSO_MAP_PASS no additional attributes are required.
+ *
+ * o REMOVE:
+ * Sent by an application to remove a specific DOI mapping table from the
+ * CALIPSO system.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_DOI
+ *
+ * o LIST:
+ * Sent by an application to list the details of a DOI definition. On
+ * success the kernel should send a response using the following format.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_DOI
+ *
+ * The valid response message format depends on the type of the DOI mapping,
+ * the defined formats are shown below.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_MTYPE
+ *
+ * If using CALIPSO_MAP_PASS no additional attributes are required.
+ *
+ * o LISTALL:
+ * This message is sent by an application to list the valid DOIs on the
+ * system. When sent by an application there is no payload and the
+ * NLM_F_DUMP flag should be set. The kernel should respond with a series of
+ * the following messages.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_DOI
+ * NLBL_CALIPSO_A_MTYPE
+ *
+ */
+
+/* NetLabel CALIPSO commands */
+enum {
+ NLBL_CALIPSO_C_UNSPEC,
+ NLBL_CALIPSO_C_ADD,
+ NLBL_CALIPSO_C_REMOVE,
+ NLBL_CALIPSO_C_LIST,
+ NLBL_CALIPSO_C_LISTALL,
+ __NLBL_CALIPSO_C_MAX,
+};
+
+/* NetLabel CALIPSO attributes */
+enum {
+ NLBL_CALIPSO_A_UNSPEC,
+ NLBL_CALIPSO_A_DOI,
+ /* (NLA_U32)
+ * the DOI value */
+ NLBL_CALIPSO_A_MTYPE,
+ /* (NLA_U32)
+ * the mapping table type (defined in the calipso.h header as
+ * CALIPSO_MAP_*) */
+ __NLBL_CALIPSO_A_MAX,
+};
+
+#define NLBL_CALIPSO_A_MAX (__NLBL_CALIPSO_A_MAX - 1)
+
+/* NetLabel protocol functions */
+#if IS_ENABLED(CONFIG_IPV6)
+int netlbl_calipso_genl_init(void);
+#else
+static inline int netlbl_calipso_genl_init(void)
+{
+ return 0;
+}
+#endif
+
+int calipso_doi_add(struct calipso_doi *doi_def,
+ struct netlbl_audit *audit_info);
+void calipso_doi_free(struct calipso_doi *doi_def);
+int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info);
+struct calipso_doi *calipso_doi_getdef(u32 doi);
+void calipso_doi_putdef(struct calipso_doi *doi_def);
+int calipso_doi_walk(u32 *skip_cnt,
+ int (*callback)(struct calipso_doi *doi_def, void *arg),
+ void *cb_arg);
+int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr);
+int calipso_sock_setattr(struct sock *sk,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr);
+void calipso_sock_delattr(struct sock *sk);
+int calipso_req_setattr(struct request_sock *req,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr);
+void calipso_req_delattr(struct request_sock *req);
+unsigned char *calipso_optptr(const struct sk_buff *skb);
+int calipso_getattr(const unsigned char *calipso,
+ struct netlbl_lsm_secattr *secattr);
+int calipso_skbuff_setattr(struct sk_buff *skb,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr);
+int calipso_skbuff_delattr(struct sk_buff *skb);
+void calipso_cache_invalidate(void);
+int calipso_cache_add(const unsigned char *calipso_ptr,
+ const struct netlbl_lsm_secattr *secattr);
+
+#endif
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index ada674222..41d0e95d1 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -37,10 +37,12 @@
#include <linux/slab.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
+#include <net/calipso.h>
#include <asm/bug.h>
#include "netlabel_mgmt.h"
#include "netlabel_addrlist.h"
+#include "netlabel_calipso.h"
#include "netlabel_domainhash.h"
#include "netlabel_user.h"
@@ -55,8 +57,9 @@ struct netlbl_domhsh_tbl {
static DEFINE_SPINLOCK(netlbl_domhsh_lock);
#define netlbl_domhsh_rcu_deref(p) \
rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
-static struct netlbl_domhsh_tbl *netlbl_domhsh;
-static struct netlbl_dom_map *netlbl_domhsh_def;
+static struct netlbl_domhsh_tbl __rcu *netlbl_domhsh;
+static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv4;
+static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv6;
/*
* Domain Hash Table Helper Functions
@@ -126,18 +129,26 @@ static u32 netlbl_domhsh_hash(const char *key)
return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1);
}
+static bool netlbl_family_match(u16 f1, u16 f2)
+{
+ return (f1 == f2) || (f1 == AF_UNSPEC) || (f2 == AF_UNSPEC);
+}
+
/**
* netlbl_domhsh_search - Search for a domain entry
* @domain: the domain
+ * @family: the address family
*
* Description:
* Searches the domain hash table and returns a pointer to the hash table
- * entry if found, otherwise NULL is returned. The caller is responsible for
+ * entry if found, otherwise NULL is returned. @family may be %AF_UNSPEC
+ * which matches any address family entries. The caller is responsible for
* ensuring that the hash table is protected with either a RCU read lock or the
* hash table lock.
*
*/
-static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
+static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain,
+ u16 family)
{
u32 bkt;
struct list_head *bkt_list;
@@ -147,7 +158,9 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
bkt = netlbl_domhsh_hash(domain);
bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt];
list_for_each_entry_rcu(iter, bkt_list, list)
- if (iter->valid && strcmp(iter->domain, domain) == 0)
+ if (iter->valid &&
+ netlbl_family_match(iter->family, family) &&
+ strcmp(iter->domain, domain) == 0)
return iter;
}
@@ -157,28 +170,37 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
/**
* netlbl_domhsh_search_def - Search for a domain entry
* @domain: the domain
- * @def: return default if no match is found
+ * @family: the address family
*
* Description:
* Searches the domain hash table and returns a pointer to the hash table
* entry if an exact match is found, if an exact match is not present in the
* hash table then the default entry is returned if valid otherwise NULL is
- * returned. The caller is responsible ensuring that the hash table is
+ * returned. @family may be %AF_UNSPEC which matches any address family
+ * entries. The caller is responsible ensuring that the hash table is
* protected with either a RCU read lock or the hash table lock.
*
*/
-static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
+static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain,
+ u16 family)
{
struct netlbl_dom_map *entry;
- entry = netlbl_domhsh_search(domain);
- if (entry == NULL) {
- entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def);
- if (entry != NULL && !entry->valid)
- entry = NULL;
+ entry = netlbl_domhsh_search(domain, family);
+ if (entry != NULL)
+ return entry;
+ if (family == AF_INET || family == AF_UNSPEC) {
+ entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv4);
+ if (entry != NULL && entry->valid)
+ return entry;
+ }
+ if (family == AF_INET6 || family == AF_UNSPEC) {
+ entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv6);
+ if (entry != NULL && entry->valid)
+ return entry;
}
- return entry;
+ return NULL;
}
/**
@@ -203,6 +225,7 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
{
struct audit_buffer *audit_buf;
struct cipso_v4_doi *cipsov4 = NULL;
+ struct calipso_doi *calipso = NULL;
u32 type;
audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
@@ -221,12 +244,14 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
struct netlbl_domaddr6_map *map6;
map6 = netlbl_domhsh_addr6_entry(addr6);
type = map6->def.type;
+ calipso = map6->def.calipso;
netlbl_af6list_audit_addr(audit_buf, 0, NULL,
&addr6->addr, &addr6->mask);
#endif /* IPv6 */
} else {
type = entry->def.type;
cipsov4 = entry->def.cipso;
+ calipso = entry->def.calipso;
}
switch (type) {
case NETLBL_NLTYPE_UNLABELED:
@@ -238,6 +263,12 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
" nlbl_protocol=cipsov4 cipso_doi=%u",
cipsov4->doi);
break;
+ case NETLBL_NLTYPE_CALIPSO:
+ BUG_ON(calipso == NULL);
+ audit_log_format(audit_buf,
+ " nlbl_protocol=calipso calipso_doi=%u",
+ calipso->doi);
+ break;
}
audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0);
audit_log_end(audit_buf);
@@ -264,13 +295,25 @@ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry)
if (entry == NULL)
return -EINVAL;
+ if (entry->family != AF_INET && entry->family != AF_INET6 &&
+ (entry->family != AF_UNSPEC ||
+ entry->def.type != NETLBL_NLTYPE_UNLABELED))
+ return -EINVAL;
+
switch (entry->def.type) {
case NETLBL_NLTYPE_UNLABELED:
- if (entry->def.cipso != NULL || entry->def.addrsel != NULL)
+ if (entry->def.cipso != NULL || entry->def.calipso != NULL ||
+ entry->def.addrsel != NULL)
return -EINVAL;
break;
case NETLBL_NLTYPE_CIPSOV4:
- if (entry->def.cipso == NULL)
+ if (entry->family != AF_INET ||
+ entry->def.cipso == NULL)
+ return -EINVAL;
+ break;
+ case NETLBL_NLTYPE_CALIPSO:
+ if (entry->family != AF_INET6 ||
+ entry->def.calipso == NULL)
return -EINVAL;
break;
case NETLBL_NLTYPE_ADDRSELECT:
@@ -294,6 +337,12 @@ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry)
map6 = netlbl_domhsh_addr6_entry(iter6);
switch (map6->def.type) {
case NETLBL_NLTYPE_UNLABELED:
+ if (map6->def.calipso != NULL)
+ return -EINVAL;
+ break;
+ case NETLBL_NLTYPE_CALIPSO:
+ if (map6->def.calipso == NULL)
+ return -EINVAL;
break;
default:
return -EINVAL;
@@ -358,15 +407,18 @@ int __init netlbl_domhsh_init(u32 size)
*
* Description:
* Adds a new entry to the domain hash table and handles any updates to the
- * lower level protocol handler (i.e. CIPSO). Returns zero on success,
- * negative on failure.
+ * lower level protocol handler (i.e. CIPSO). @entry->family may be set to
+ * %AF_UNSPEC which will add an entry that matches all address families. This
+ * is only useful for the unlabelled type and will only succeed if there is no
+ * existing entry for any address family with the same domain. Returns zero
+ * on success, negative on failure.
*
*/
int netlbl_domhsh_add(struct netlbl_dom_map *entry,
struct netlbl_audit *audit_info)
{
int ret_val = 0;
- struct netlbl_dom_map *entry_old;
+ struct netlbl_dom_map *entry_old, *entry_b;
struct netlbl_af4list *iter4;
struct netlbl_af4list *tmp4;
#if IS_ENABLED(CONFIG_IPV6)
@@ -385,9 +437,10 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
rcu_read_lock();
spin_lock(&netlbl_domhsh_lock);
if (entry->domain != NULL)
- entry_old = netlbl_domhsh_search(entry->domain);
+ entry_old = netlbl_domhsh_search(entry->domain, entry->family);
else
- entry_old = netlbl_domhsh_search_def(entry->domain);
+ entry_old = netlbl_domhsh_search_def(entry->domain,
+ entry->family);
if (entry_old == NULL) {
entry->valid = 1;
@@ -397,7 +450,41 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
&rcu_dereference(netlbl_domhsh)->tbl[bkt]);
} else {
INIT_LIST_HEAD(&entry->list);
- rcu_assign_pointer(netlbl_domhsh_def, entry);
+ switch (entry->family) {
+ case AF_INET:
+ rcu_assign_pointer(netlbl_domhsh_def_ipv4,
+ entry);
+ break;
+ case AF_INET6:
+ rcu_assign_pointer(netlbl_domhsh_def_ipv6,
+ entry);
+ break;
+ case AF_UNSPEC:
+ if (entry->def.type !=
+ NETLBL_NLTYPE_UNLABELED) {
+ ret_val = -EINVAL;
+ goto add_return;
+ }
+ entry_b = kzalloc(sizeof(*entry_b), GFP_ATOMIC);
+ if (entry_b == NULL) {
+ ret_val = -ENOMEM;
+ goto add_return;
+ }
+ entry_b->family = AF_INET6;
+ entry_b->def.type = NETLBL_NLTYPE_UNLABELED;
+ entry_b->valid = 1;
+ entry->family = AF_INET;
+ rcu_assign_pointer(netlbl_domhsh_def_ipv4,
+ entry);
+ rcu_assign_pointer(netlbl_domhsh_def_ipv6,
+ entry_b);
+ break;
+ default:
+ /* Already checked in
+ * netlbl_domhsh_validate(). */
+ ret_val = -EINVAL;
+ goto add_return;
+ }
}
if (entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
@@ -513,10 +600,12 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
spin_lock(&netlbl_domhsh_lock);
if (entry->valid) {
entry->valid = 0;
- if (entry != rcu_dereference(netlbl_domhsh_def))
- list_del_rcu(&entry->list);
+ if (entry == rcu_dereference(netlbl_domhsh_def_ipv4))
+ RCU_INIT_POINTER(netlbl_domhsh_def_ipv4, NULL);
+ else if (entry == rcu_dereference(netlbl_domhsh_def_ipv6))
+ RCU_INIT_POINTER(netlbl_domhsh_def_ipv6, NULL);
else
- RCU_INIT_POINTER(netlbl_domhsh_def, NULL);
+ list_del_rcu(&entry->list);
} else
ret_val = -ENOENT;
spin_unlock(&netlbl_domhsh_lock);
@@ -533,6 +622,10 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
if (ret_val == 0) {
struct netlbl_af4list *iter4;
struct netlbl_domaddr4_map *map4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *iter6;
+ struct netlbl_domaddr6_map *map6;
+#endif /* IPv6 */
switch (entry->def.type) {
case NETLBL_NLTYPE_ADDRSELECT:
@@ -541,12 +634,22 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
map4 = netlbl_domhsh_addr4_entry(iter4);
cipso_v4_doi_putdef(map4->def.cipso);
}
- /* no need to check the IPv6 list since we currently
- * support only unlabeled protocols for IPv6 */
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_rcu(iter6,
+ &entry->def.addrsel->list6) {
+ map6 = netlbl_domhsh_addr6_entry(iter6);
+ calipso_doi_putdef(map6->def.calipso);
+ }
+#endif /* IPv6 */
break;
case NETLBL_NLTYPE_CIPSOV4:
cipso_v4_doi_putdef(entry->def.cipso);
break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NETLBL_NLTYPE_CALIPSO:
+ calipso_doi_putdef(entry->def.calipso);
+ break;
+#endif /* IPv6 */
}
call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
}
@@ -583,9 +686,9 @@ int netlbl_domhsh_remove_af4(const char *domain,
rcu_read_lock();
if (domain)
- entry_map = netlbl_domhsh_search(domain);
+ entry_map = netlbl_domhsh_search(domain, AF_INET);
else
- entry_map = netlbl_domhsh_search_def(domain);
+ entry_map = netlbl_domhsh_search_def(domain, AF_INET);
if (entry_map == NULL ||
entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
goto remove_af4_failure;
@@ -622,28 +725,114 @@ remove_af4_failure:
return -ENOENT;
}
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_domhsh_remove_af6 - Removes an address selector entry
+ * @domain: the domain
+ * @addr: IPv6 address
+ * @mask: IPv6 address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an individual address selector from a domain mapping and potentially
+ * the entire mapping if it is empty. Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int netlbl_domhsh_remove_af6(const char *domain,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+ struct netlbl_dom_map *entry_map;
+ struct netlbl_af6list *entry_addr;
+ struct netlbl_af4list *iter4;
+ struct netlbl_af6list *iter6;
+ struct netlbl_domaddr6_map *entry;
+
+ rcu_read_lock();
+
+ if (domain)
+ entry_map = netlbl_domhsh_search(domain, AF_INET6);
+ else
+ entry_map = netlbl_domhsh_search_def(domain, AF_INET6);
+ if (entry_map == NULL ||
+ entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
+ goto remove_af6_failure;
+
+ spin_lock(&netlbl_domhsh_lock);
+ entry_addr = netlbl_af6list_remove(addr, mask,
+ &entry_map->def.addrsel->list6);
+ spin_unlock(&netlbl_domhsh_lock);
+
+ if (entry_addr == NULL)
+ goto remove_af6_failure;
+ netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
+ goto remove_af6_single_addr;
+ netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
+ goto remove_af6_single_addr;
+ /* the domain mapping is empty so remove it from the mapping table */
+ netlbl_domhsh_remove_entry(entry_map, audit_info);
+
+remove_af6_single_addr:
+ rcu_read_unlock();
+ /* yick, we can't use call_rcu here because we don't have a rcu head
+ * pointer but hopefully this should be a rare case so the pause
+ * shouldn't be a problem */
+ synchronize_rcu();
+ entry = netlbl_domhsh_addr6_entry(entry_addr);
+ calipso_doi_putdef(entry->def.calipso);
+ kfree(entry);
+ return 0;
+
+remove_af6_failure:
+ rcu_read_unlock();
+ return -ENOENT;
+}
+#endif /* IPv6 */
+
/**
* netlbl_domhsh_remove - Removes an entry from the domain hash table
* @domain: the domain to remove
+ * @family: address family
* @audit_info: NetLabel audit information
*
* Description:
* Removes an entry from the domain hash table and handles any updates to the
- * lower level protocol handler (i.e. CIPSO). Returns zero on success,
- * negative on failure.
+ * lower level protocol handler (i.e. CIPSO). @family may be %AF_UNSPEC which
+ * removes all address family entries. Returns zero on success, negative on
+ * failure.
*
*/
-int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
+int netlbl_domhsh_remove(const char *domain, u16 family,
+ struct netlbl_audit *audit_info)
{
- int ret_val;
+ int ret_val = -EINVAL;
struct netlbl_dom_map *entry;
rcu_read_lock();
- if (domain)
- entry = netlbl_domhsh_search(domain);
- else
- entry = netlbl_domhsh_search_def(domain);
- ret_val = netlbl_domhsh_remove_entry(entry, audit_info);
+
+ if (family == AF_INET || family == AF_UNSPEC) {
+ if (domain)
+ entry = netlbl_domhsh_search(domain, AF_INET);
+ else
+ entry = netlbl_domhsh_search_def(domain, AF_INET);
+ ret_val = netlbl_domhsh_remove_entry(entry, audit_info);
+ if (ret_val && ret_val != -ENOENT)
+ goto done;
+ }
+ if (family == AF_INET6 || family == AF_UNSPEC) {
+ int ret_val2;
+
+ if (domain)
+ entry = netlbl_domhsh_search(domain, AF_INET6);
+ else
+ entry = netlbl_domhsh_search_def(domain, AF_INET6);
+ ret_val2 = netlbl_domhsh_remove_entry(entry, audit_info);
+ if (ret_val2 != -ENOENT)
+ ret_val = ret_val2;
+ }
+done:
rcu_read_unlock();
return ret_val;
@@ -651,32 +840,38 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
/**
* netlbl_domhsh_remove_default - Removes the default entry from the table
+ * @family: address family
* @audit_info: NetLabel audit information
*
* Description:
- * Removes/resets the default entry for the domain hash table and handles any
- * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on
- * success, non-zero on failure.
+ * Removes/resets the default entry corresponding to @family from the domain
+ * hash table and handles any updates to the lower level protocol handler
+ * (i.e. CIPSO). @family may be %AF_UNSPEC which removes all address family
+ * entries. Returns zero on success, negative on failure.
*
*/
-int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info)
+int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info)
{
- return netlbl_domhsh_remove(NULL, audit_info);
+ return netlbl_domhsh_remove(NULL, family, audit_info);
}
/**
* netlbl_domhsh_getentry - Get an entry from the domain hash table
* @domain: the domain name to search for
+ * @family: address family
*
* Description:
* Look through the domain hash table searching for an entry to match @domain,
- * return a pointer to a copy of the entry or NULL. The caller is responsible
- * for ensuring that rcu_read_[un]lock() is called.
+ * with address family @family, return a pointer to a copy of the entry or
+ * NULL. The caller is responsible for ensuring that rcu_read_[un]lock() is
+ * called.
*
*/
-struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
+struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family)
{
- return netlbl_domhsh_search_def(domain);
+ if (family == AF_UNSPEC)
+ return NULL;
+ return netlbl_domhsh_search_def(domain, family);
}
/**
@@ -696,7 +891,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain,
struct netlbl_dom_map *dom_iter;
struct netlbl_af4list *addr_iter;
- dom_iter = netlbl_domhsh_search_def(domain);
+ dom_iter = netlbl_domhsh_search_def(domain, AF_INET);
if (dom_iter == NULL)
return NULL;
@@ -726,7 +921,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain,
struct netlbl_dom_map *dom_iter;
struct netlbl_af6list *addr_iter;
- dom_iter = netlbl_domhsh_search_def(domain);
+ dom_iter = netlbl_domhsh_search_def(domain, AF_INET6);
if (dom_iter == NULL)
return NULL;
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index 680caf4df..1f9247781 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -51,6 +51,7 @@ struct netlbl_dommap_def {
union {
struct netlbl_domaddr_map *addrsel;
struct cipso_v4_doi *cipso;
+ struct calipso_doi *calipso;
};
};
#define netlbl_domhsh_addr4_entry(iter) \
@@ -70,6 +71,7 @@ struct netlbl_domaddr6_map {
struct netlbl_dom_map {
char *domain;
+ u16 family;
struct netlbl_dommap_def def;
u32 valid;
@@ -91,14 +93,23 @@ int netlbl_domhsh_remove_af4(const char *domain,
const struct in_addr *addr,
const struct in_addr *mask,
struct netlbl_audit *audit_info);
-int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
-int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info);
-struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
+int netlbl_domhsh_remove_af6(const char *domain,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove(const char *domain, u16 family,
+ struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info);
+struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family);
struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain,
__be32 addr);
#if IS_ENABLED(CONFIG_IPV6)
struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain,
const struct in6_addr *addr);
+int netlbl_domhsh_remove_af6(const char *domain,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info);
#endif /* IPv6 */
int netlbl_domhsh_walk(u32 *skip_bkt,
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index bd007a9fd..28c56b95f 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -37,12 +37,14 @@
#include <net/ipv6.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
+#include <net/calipso.h>
#include <asm/bug.h>
#include <linux/atomic.h>
#include "netlabel_domainhash.h"
#include "netlabel_unlabeled.h"
#include "netlabel_cipso_v4.h"
+#include "netlabel_calipso.h"
#include "netlabel_user.h"
#include "netlabel_mgmt.h"
#include "netlabel_addrlist.h"
@@ -72,12 +74,17 @@ int netlbl_cfg_map_del(const char *domain,
struct netlbl_audit *audit_info)
{
if (addr == NULL && mask == NULL) {
- return netlbl_domhsh_remove(domain, audit_info);
+ return netlbl_domhsh_remove(domain, family, audit_info);
} else if (addr != NULL && mask != NULL) {
switch (family) {
case AF_INET:
return netlbl_domhsh_remove_af4(domain, addr, mask,
audit_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return netlbl_domhsh_remove_af6(domain, addr, mask,
+ audit_info);
+#endif /* IPv6 */
default:
return -EPFNOSUPPORT;
}
@@ -119,6 +126,7 @@ int netlbl_cfg_unlbl_map_add(const char *domain,
if (entry->domain == NULL)
goto cfg_unlbl_map_add_failure;
}
+ entry->family = family;
if (addr == NULL && mask == NULL)
entry->def.type = NETLBL_NLTYPE_UNLABELED;
@@ -345,6 +353,7 @@ int netlbl_cfg_cipsov4_map_add(u32 doi,
entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
if (entry == NULL)
goto out_entry;
+ entry->family = AF_INET;
if (domain != NULL) {
entry->domain = kstrdup(domain, GFP_ATOMIC);
if (entry->domain == NULL)
@@ -399,6 +408,139 @@ out_entry:
return ret_val;
}
+/**
+ * netlbl_cfg_calipso_add - Add a new CALIPSO DOI definition
+ * @doi_def: CALIPSO DOI definition
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new CALIPSO DOI definition as defined by @doi_def. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return calipso_doi_add(doi_def, audit_info);
+#else /* IPv6 */
+ return -ENOSYS;
+#endif /* IPv6 */
+}
+
+/**
+ * netlbl_cfg_calipso_del - Remove an existing CALIPSO DOI definition
+ * @doi: CALIPSO DOI
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an existing CALIPSO DOI definition matching @doi. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ calipso_doi_remove(doi, audit_info);
+#endif /* IPv6 */
+}
+
+/**
+ * netlbl_cfg_calipso_map_add - Add a new CALIPSO DOI mapping
+ * @doi: the CALIPSO DOI
+ * @domain: the domain mapping to add
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new NetLabel/LSM domain mapping for the given CALIPSO DOI to the
+ * NetLabel subsystem. A @domain value of NULL adds a new default domain
+ * mapping. Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_cfg_calipso_map_add(u32 doi,
+ const char *domain,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ int ret_val = -ENOMEM;
+ struct calipso_doi *doi_def;
+ struct netlbl_dom_map *entry;
+ struct netlbl_domaddr_map *addrmap = NULL;
+ struct netlbl_domaddr6_map *addrinfo = NULL;
+
+ doi_def = calipso_doi_getdef(doi);
+ if (doi_def == NULL)
+ return -ENOENT;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL)
+ goto out_entry;
+ entry->family = AF_INET6;
+ if (domain != NULL) {
+ entry->domain = kstrdup(domain, GFP_ATOMIC);
+ if (entry->domain == NULL)
+ goto out_domain;
+ }
+
+ if (addr == NULL && mask == NULL) {
+ entry->def.calipso = doi_def;
+ entry->def.type = NETLBL_NLTYPE_CALIPSO;
+ } else if (addr != NULL && mask != NULL) {
+ addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
+ if (addrmap == NULL)
+ goto out_addrmap;
+ INIT_LIST_HEAD(&addrmap->list4);
+ INIT_LIST_HEAD(&addrmap->list6);
+
+ addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC);
+ if (addrinfo == NULL)
+ goto out_addrinfo;
+ addrinfo->def.calipso = doi_def;
+ addrinfo->def.type = NETLBL_NLTYPE_CALIPSO;
+ addrinfo->list.addr = *addr;
+ addrinfo->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
+ addrinfo->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
+ addrinfo->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
+ addrinfo->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
+ addrinfo->list.mask = *mask;
+ addrinfo->list.valid = 1;
+ ret_val = netlbl_af6list_add(&addrinfo->list, &addrmap->list6);
+ if (ret_val != 0)
+ goto cfg_calipso_map_add_failure;
+
+ entry->def.addrsel = addrmap;
+ entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
+ } else {
+ ret_val = -EINVAL;
+ goto out_addrmap;
+ }
+
+ ret_val = netlbl_domhsh_add(entry, audit_info);
+ if (ret_val != 0)
+ goto cfg_calipso_map_add_failure;
+
+ return 0;
+
+cfg_calipso_map_add_failure:
+ kfree(addrinfo);
+out_addrinfo:
+ kfree(addrmap);
+out_addrmap:
+ kfree(entry->domain);
+out_domain:
+ kfree(entry);
+out_entry:
+ calipso_doi_putdef(doi_def);
+ return ret_val;
+#else /* IPv6 */
+ return -ENOSYS;
+#endif /* IPv6 */
+}
+
/*
* Security Attribute Functions
*/
@@ -519,6 +661,7 @@ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset)
return -ENOENT;
}
+EXPORT_SYMBOL(netlbl_catmap_walk);
/**
* netlbl_catmap_walkrng - Find the end of a string of set bits
@@ -609,20 +752,19 @@ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,
off = catmap->startbit;
*offset = off;
}
- iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_NONE, 0);
+ iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_WALK, 0);
if (iter == NULL) {
*offset = (u32)-1;
return 0;
}
if (off < iter->startbit) {
- off = iter->startbit;
- *offset = off;
+ *offset = iter->startbit;
+ off = 0;
} else
off -= iter->startbit;
-
idx = off / NETLBL_CATMAP_MAPSIZE;
- *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_SIZE);
+ *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_MAPSIZE);
return 0;
}
@@ -655,6 +797,7 @@ int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
return 0;
}
+EXPORT_SYMBOL(netlbl_catmap_setbit);
/**
* netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap
@@ -727,6 +870,76 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
return 0;
}
+/* Bitmap functions
+ */
+
+/**
+ * netlbl_bitmap_walk - Walk a bitmap looking for a bit
+ * @bitmap: the bitmap
+ * @bitmap_len: length in bits
+ * @offset: starting offset
+ * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
+ *
+ * Description:
+ * Starting at @offset, walk the bitmap from left to right until either the
+ * desired bit is found or we reach the end. Return the bit offset, -1 if
+ * not found, or -2 if error.
+ */
+int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len,
+ u32 offset, u8 state)
+{
+ u32 bit_spot;
+ u32 byte_offset;
+ unsigned char bitmask;
+ unsigned char byte;
+
+ byte_offset = offset / 8;
+ byte = bitmap[byte_offset];
+ bit_spot = offset;
+ bitmask = 0x80 >> (offset % 8);
+
+ while (bit_spot < bitmap_len) {
+ if ((state && (byte & bitmask) == bitmask) ||
+ (state == 0 && (byte & bitmask) == 0))
+ return bit_spot;
+
+ bit_spot++;
+ bitmask >>= 1;
+ if (bitmask == 0) {
+ byte = bitmap[++byte_offset];
+ bitmask = 0x80;
+ }
+ }
+
+ return -1;
+}
+EXPORT_SYMBOL(netlbl_bitmap_walk);
+
+/**
+ * netlbl_bitmap_setbit - Sets a single bit in a bitmap
+ * @bitmap: the bitmap
+ * @bit: the bit
+ * @state: if non-zero, set the bit (1) else clear the bit (0)
+ *
+ * Description:
+ * Set a single bit in the bitmask. Returns zero on success, negative values
+ * on error.
+ */
+void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state)
+{
+ u32 byte_spot;
+ u8 bitmask;
+
+ /* gcc always rounds to zero when doing integer division */
+ byte_spot = bit / 8;
+ bitmask = 0x80 >> (bit % 8);
+ if (state)
+ bitmap[byte_spot] |= bitmask;
+ else
+ bitmap[byte_spot] &= ~bitmask;
+}
+EXPORT_SYMBOL(netlbl_bitmap_setbit);
+
/*
* LSM Functions
*/
@@ -774,7 +987,7 @@ int netlbl_sock_setattr(struct sock *sk,
struct netlbl_dom_map *dom_entry;
rcu_read_lock();
- dom_entry = netlbl_domhsh_getentry(secattr->domain);
+ dom_entry = netlbl_domhsh_getentry(secattr->domain, family);
if (dom_entry == NULL) {
ret_val = -ENOENT;
goto socket_setattr_return;
@@ -799,9 +1012,21 @@ int netlbl_sock_setattr(struct sock *sk,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- /* since we don't support any IPv6 labeling protocols right
- * now we can optimize everything away until we do */
- ret_val = 0;
+ switch (dom_entry->def.type) {
+ case NETLBL_NLTYPE_ADDRSELECT:
+ ret_val = -EDESTADDRREQ;
+ break;
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = calipso_sock_setattr(sk,
+ dom_entry->def.calipso,
+ secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ ret_val = 0;
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
break;
#endif /* IPv6 */
default:
@@ -828,6 +1053,11 @@ void netlbl_sock_delattr(struct sock *sk)
case AF_INET:
cipso_v4_sock_delattr(sk);
break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ calipso_sock_delattr(sk);
+ break;
+#endif /* IPv6 */
}
}
@@ -854,7 +1084,7 @@ int netlbl_sock_getattr(struct sock *sk,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- ret_val = -ENOMSG;
+ ret_val = calipso_sock_getattr(sk, secattr);
break;
#endif /* IPv6 */
default:
@@ -882,6 +1112,9 @@ int netlbl_conn_setattr(struct sock *sk,
{
int ret_val;
struct sockaddr_in *addr4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 *addr6;
+#endif
struct netlbl_dommap_def *entry;
rcu_read_lock();
@@ -902,7 +1135,7 @@ int netlbl_conn_setattr(struct sock *sk,
case NETLBL_NLTYPE_UNLABELED:
/* just delete the protocols we support for right now
* but we could remove other protocols if needed */
- cipso_v4_sock_delattr(sk);
+ netlbl_sock_delattr(sk);
ret_val = 0;
break;
default:
@@ -911,9 +1144,27 @@ int netlbl_conn_setattr(struct sock *sk,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- /* since we don't support any IPv6 labeling protocols right
- * now we can optimize everything away until we do */
- ret_val = 0;
+ addr6 = (struct sockaddr_in6 *)addr;
+ entry = netlbl_domhsh_getentry_af6(secattr->domain,
+ &addr6->sin6_addr);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto conn_setattr_return;
+ }
+ switch (entry->type) {
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = calipso_sock_setattr(sk,
+ entry->calipso, secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ /* just delete the protocols we support for right now
+ * but we could remove other protocols if needed */
+ netlbl_sock_delattr(sk);
+ ret_val = 0;
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
break;
#endif /* IPv6 */
default:
@@ -940,12 +1191,13 @@ int netlbl_req_setattr(struct request_sock *req,
{
int ret_val;
struct netlbl_dommap_def *entry;
+ struct inet_request_sock *ireq = inet_rsk(req);
rcu_read_lock();
switch (req->rsk_ops->family) {
case AF_INET:
entry = netlbl_domhsh_getentry_af4(secattr->domain,
- inet_rsk(req)->ir_rmt_addr);
+ ireq->ir_rmt_addr);
if (entry == NULL) {
ret_val = -ENOENT;
goto req_setattr_return;
@@ -956,9 +1208,7 @@ int netlbl_req_setattr(struct request_sock *req,
entry->cipso, secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
- /* just delete the protocols we support for right now
- * but we could remove other protocols if needed */
- cipso_v4_req_delattr(req);
+ netlbl_req_delattr(req);
ret_val = 0;
break;
default:
@@ -967,9 +1217,24 @@ int netlbl_req_setattr(struct request_sock *req,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- /* since we don't support any IPv6 labeling protocols right
- * now we can optimize everything away until we do */
- ret_val = 0;
+ entry = netlbl_domhsh_getentry_af6(secattr->domain,
+ &ireq->ir_v6_rmt_addr);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto req_setattr_return;
+ }
+ switch (entry->type) {
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = calipso_req_setattr(req,
+ entry->calipso, secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ netlbl_req_delattr(req);
+ ret_val = 0;
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
break;
#endif /* IPv6 */
default:
@@ -995,6 +1260,11 @@ void netlbl_req_delattr(struct request_sock *req)
case AF_INET:
cipso_v4_req_delattr(req);
break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ calipso_req_delattr(req);
+ break;
+#endif /* IPv6 */
}
}
@@ -1015,13 +1285,17 @@ int netlbl_skbuff_setattr(struct sk_buff *skb,
{
int ret_val;
struct iphdr *hdr4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct ipv6hdr *hdr6;
+#endif
struct netlbl_dommap_def *entry;
rcu_read_lock();
switch (family) {
case AF_INET:
hdr4 = ip_hdr(skb);
- entry = netlbl_domhsh_getentry_af4(secattr->domain,hdr4->daddr);
+ entry = netlbl_domhsh_getentry_af4(secattr->domain,
+ hdr4->daddr);
if (entry == NULL) {
ret_val = -ENOENT;
goto skbuff_setattr_return;
@@ -1042,9 +1316,26 @@ int netlbl_skbuff_setattr(struct sk_buff *skb,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- /* since we don't support any IPv6 labeling protocols right
- * now we can optimize everything away until we do */
- ret_val = 0;
+ hdr6 = ipv6_hdr(skb);
+ entry = netlbl_domhsh_getentry_af6(secattr->domain,
+ &hdr6->daddr);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto skbuff_setattr_return;
+ }
+ switch (entry->type) {
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = calipso_skbuff_setattr(skb, entry->calipso,
+ secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ /* just delete the protocols we support for right now
+ * but we could remove other protocols if needed */
+ ret_val = calipso_skbuff_delattr(skb);
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
break;
#endif /* IPv6 */
default:
@@ -1083,6 +1374,9 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
+ ptr = calipso_optptr(skb);
+ if (ptr && calipso_getattr(ptr, secattr) == 0)
+ return 0;
break;
#endif /* IPv6 */
}
@@ -1093,6 +1387,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
/**
* netlbl_skbuff_err - Handle a LSM error on a sk_buff
* @skb: the packet
+ * @family: the family
* @error: the error code
* @gateway: true if host is acting as a gateway, false otherwise
*
@@ -1102,10 +1397,14 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
* according to the packet's labeling protocol.
*
*/
-void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway)
+void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway)
{
- if (cipso_v4_optptr(skb))
- cipso_v4_error(skb, error, gateway);
+ switch (family) {
+ case AF_INET:
+ if (cipso_v4_optptr(skb))
+ cipso_v4_error(skb, error, gateway);
+ break;
+ }
}
/**
@@ -1120,11 +1419,15 @@ void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway)
void netlbl_cache_invalidate(void)
{
cipso_v4_cache_invalidate();
+#if IS_ENABLED(CONFIG_IPV6)
+ calipso_cache_invalidate();
+#endif /* IPv6 */
}
/**
* netlbl_cache_add - Add an entry to a NetLabel protocol cache
* @skb: the packet
+ * @family: the family
* @secattr: the packet's security attributes
*
* Description:
@@ -1133,7 +1436,7 @@ void netlbl_cache_invalidate(void)
* values on error.
*
*/
-int netlbl_cache_add(const struct sk_buff *skb,
+int netlbl_cache_add(const struct sk_buff *skb, u16 family,
const struct netlbl_lsm_secattr *secattr)
{
unsigned char *ptr;
@@ -1141,10 +1444,20 @@ int netlbl_cache_add(const struct sk_buff *skb,
if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0)
return -ENOMSG;
- ptr = cipso_v4_optptr(skb);
- if (ptr)
- return cipso_v4_cache_add(ptr, secattr);
-
+ switch (family) {
+ case AF_INET:
+ ptr = cipso_v4_optptr(skb);
+ if (ptr)
+ return cipso_v4_cache_add(ptr, secattr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ ptr = calipso_optptr(skb);
+ if (ptr)
+ return calipso_cache_add(ptr, secattr);
+ break;
+#endif /* IPv6 */
+ }
return -ENOMSG;
}
@@ -1169,6 +1482,7 @@ struct audit_buffer *netlbl_audit_start(int type,
{
return netlbl_audit_start_common(type, audit_info);
}
+EXPORT_SYMBOL(netlbl_audit_start);
/*
* Setup Functions
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 13f777f20..f85d0e07a 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -41,8 +41,10 @@
#include <net/ipv6.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
+#include <net/calipso.h>
#include <linux/atomic.h>
+#include "netlabel_calipso.h"
#include "netlabel_domainhash.h"
#include "netlabel_user.h"
#include "netlabel_mgmt.h"
@@ -72,6 +74,8 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
[NLBL_MGMT_A_PROTOCOL] = { .type = NLA_U32 },
[NLBL_MGMT_A_VERSION] = { .type = NLA_U32 },
[NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 },
+ [NLBL_MGMT_A_FAMILY] = { .type = NLA_U16 },
+ [NLBL_MGMT_A_CLPDOI] = { .type = NLA_U32 },
};
/*
@@ -95,6 +99,9 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
int ret_val = -EINVAL;
struct netlbl_domaddr_map *addrmap = NULL;
struct cipso_v4_doi *cipsov4 = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct calipso_doi *calipso = NULL;
+#endif
u32 tmp_val;
struct netlbl_dom_map *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
@@ -119,6 +126,11 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
switch (entry->def.type) {
case NETLBL_NLTYPE_UNLABELED:
+ if (info->attrs[NLBL_MGMT_A_FAMILY])
+ entry->family =
+ nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]);
+ else
+ entry->family = AF_UNSPEC;
break;
case NETLBL_NLTYPE_CIPSOV4:
if (!info->attrs[NLBL_MGMT_A_CV4DOI])
@@ -128,12 +140,30 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
cipsov4 = cipso_v4_doi_getdef(tmp_val);
if (cipsov4 == NULL)
goto add_free_domain;
+ entry->family = AF_INET;
entry->def.cipso = cipsov4;
break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NETLBL_NLTYPE_CALIPSO:
+ if (!info->attrs[NLBL_MGMT_A_CLPDOI])
+ goto add_free_domain;
+
+ tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CLPDOI]);
+ calipso = calipso_doi_getdef(tmp_val);
+ if (calipso == NULL)
+ goto add_free_domain;
+ entry->family = AF_INET6;
+ entry->def.calipso = calipso;
+ break;
+#endif /* IPv6 */
default:
goto add_free_domain;
}
+ if ((entry->family == AF_INET && info->attrs[NLBL_MGMT_A_IPV6ADDR]) ||
+ (entry->family == AF_INET6 && info->attrs[NLBL_MGMT_A_IPV4ADDR]))
+ goto add_doi_put_def;
+
if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) {
struct in_addr *addr;
struct in_addr *mask;
@@ -178,6 +208,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
goto add_free_addrmap;
}
+ entry->family = AF_INET;
entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
entry->def.addrsel = addrmap;
#if IS_ENABLED(CONFIG_IPV6)
@@ -220,6 +251,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
map->list.mask = *mask;
map->list.valid = 1;
map->def.type = entry->def.type;
+ if (calipso)
+ map->def.calipso = calipso;
ret_val = netlbl_af6list_add(&map->list, &addrmap->list6);
if (ret_val != 0) {
@@ -227,6 +260,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
goto add_free_addrmap;
}
+ entry->family = AF_INET6;
entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
entry->def.addrsel = addrmap;
#endif /* IPv6 */
@@ -242,6 +276,9 @@ add_free_addrmap:
kfree(addrmap);
add_doi_put_def:
cipso_v4_doi_putdef(cipsov4);
+#if IS_ENABLED(CONFIG_IPV6)
+ calipso_doi_putdef(calipso);
+#endif
add_free_domain:
kfree(entry->domain);
add_free_entry:
@@ -278,6 +315,10 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb,
return ret_val;
}
+ ret_val = nla_put_u16(skb, NLBL_MGMT_A_FAMILY, entry->family);
+ if (ret_val != 0)
+ return ret_val;
+
switch (entry->def.type) {
case NETLBL_NLTYPE_ADDRSELECT:
nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST);
@@ -340,6 +381,15 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb,
if (ret_val != 0)
return ret_val;
+ switch (map6->def.type) {
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI,
+ map6->def.calipso->doi);
+ if (ret_val != 0)
+ return ret_val;
+ break;
+ }
+
nla_nest_end(skb, nla_b);
}
#endif /* IPv6 */
@@ -347,15 +397,25 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb,
nla_nest_end(skb, nla_a);
break;
case NETLBL_NLTYPE_UNLABELED:
- ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type);
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+ entry->def.type);
break;
case NETLBL_NLTYPE_CIPSOV4:
- ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type);
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+ entry->def.type);
if (ret_val != 0)
return ret_val;
ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI,
entry->def.cipso->doi);
break;
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+ entry->def.type);
+ if (ret_val != 0)
+ return ret_val;
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI,
+ entry->def.calipso->doi);
+ break;
}
return ret_val;
@@ -418,7 +478,7 @@ static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info)
netlbl_netlink_auditinfo(skb, &audit_info);
domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]);
- return netlbl_domhsh_remove(domain, &audit_info);
+ return netlbl_domhsh_remove(domain, AF_UNSPEC, &audit_info);
}
/**
@@ -536,7 +596,7 @@ static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info)
netlbl_netlink_auditinfo(skb, &audit_info);
- return netlbl_domhsh_remove_default(&audit_info);
+ return netlbl_domhsh_remove_default(AF_UNSPEC, &audit_info);
}
/**
@@ -556,6 +616,12 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *ans_skb = NULL;
void *data;
struct netlbl_dom_map *entry;
+ u16 family;
+
+ if (info->attrs[NLBL_MGMT_A_FAMILY])
+ family = nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]);
+ else
+ family = AF_INET;
ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (ans_skb == NULL)
@@ -566,7 +632,7 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info)
goto listdef_failure;
rcu_read_lock();
- entry = netlbl_domhsh_getentry(NULL);
+ entry = netlbl_domhsh_getentry(NULL, family);
if (entry == NULL) {
ret_val = -ENOENT;
goto listdef_failure_lock;
@@ -651,6 +717,15 @@ static int netlbl_mgmt_protocols(struct sk_buff *skb,
goto protocols_return;
protos_sent++;
}
+#if IS_ENABLED(CONFIG_IPV6)
+ if (protos_sent == 2) {
+ if (netlbl_mgmt_protocols_cb(skb,
+ cb,
+ NETLBL_NLTYPE_CALIPSO) < 0)
+ goto protocols_return;
+ protos_sent++;
+ }
+#endif
protocols_return:
cb->args[0] = protos_sent;
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h
index 8b6e1ab62..ea01e42bc 100644
--- a/net/netlabel/netlabel_mgmt.h
+++ b/net/netlabel/netlabel_mgmt.h
@@ -58,7 +58,10 @@
*
* NLBL_MGMT_A_CV4DOI
*
- * If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ * If using NETLBL_NLTYPE_UNLABELED no other attributes are required,
+ * however the following attribute may optionally be sent:
+ *
+ * NLBL_MGMT_A_FAMILY
*
* o REMOVE:
* Sent by an application to remove a domain mapping from the NetLabel
@@ -77,6 +80,7 @@
* Required attributes:
*
* NLBL_MGMT_A_DOMAIN
+ * NLBL_MGMT_A_FAMILY
*
* If the IP address selectors are not used the following attribute is
* required:
@@ -108,7 +112,10 @@
*
* NLBL_MGMT_A_CV4DOI
*
- * If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ * If using NETLBL_NLTYPE_UNLABELED no other attributes are required,
+ * however the following attribute may optionally be sent:
+ *
+ * NLBL_MGMT_A_FAMILY
*
* o REMOVEDEF:
* Sent by an application to remove the default domain mapping from the
@@ -117,13 +124,17 @@
* o LISTDEF:
* This message can be sent either from an application or by the kernel in
* response to an application generated LISTDEF message. When sent by an
- * application there is no payload. On success the kernel should send a
- * response using the following format.
+ * application there may be an optional payload.
*
- * If the IP address selectors are not used the following attribute is
+ * NLBL_MGMT_A_FAMILY
+ *
+ * On success the kernel should send a response using the following format:
+ *
+ * If the IP address selectors are not used the following attributes are
* required:
*
* NLBL_MGMT_A_PROTOCOL
+ * NLBL_MGMT_A_FAMILY
*
* If the IP address selectors are used then the following attritbute is
* required:
@@ -209,6 +220,12 @@ enum {
/* (NLA_NESTED)
* the selector list, there must be at least one
* NLBL_MGMT_A_ADDRSELECTOR attribute */
+ NLBL_MGMT_A_FAMILY,
+ /* (NLA_U16)
+ * The address family */
+ NLBL_MGMT_A_CLPDOI,
+ /* (NLA_U32)
+ * the CALIPSO DOI value */
__NLBL_MGMT_A_MAX,
};
#define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1)
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 9eaa9a1e8..4528cff91 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -116,8 +116,8 @@ struct netlbl_unlhsh_walk_arg {
static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
#define netlbl_unlhsh_rcu_deref(p) \
rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock))
-static struct netlbl_unlhsh_tbl *netlbl_unlhsh;
-static struct netlbl_unlhsh_iface *netlbl_unlhsh_def;
+static struct netlbl_unlhsh_tbl __rcu *netlbl_unlhsh;
+static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def;
/* Accept unlabeled packets flag */
static u8 netlabel_unlabel_acceptflg;
@@ -1537,6 +1537,7 @@ int __init netlbl_unlabel_defconf(void)
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (entry == NULL)
return -ENOMEM;
+ entry->family = AF_UNSPEC;
entry->def.type = NETLBL_NLTYPE_UNLABELED;
ret_val = netlbl_domhsh_add_default(entry, &audit_info);
if (ret_val != 0)
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index adf8b7900..58495f44c 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -44,6 +44,7 @@
#include "netlabel_mgmt.h"
#include "netlabel_unlabeled.h"
#include "netlabel_cipso_v4.h"
+#include "netlabel_calipso.h"
#include "netlabel_user.h"
/*
@@ -71,6 +72,10 @@ int __init netlbl_netlink_init(void)
if (ret_val != 0)
return ret_val;
+ ret_val = netlbl_calipso_genl_init();
+ if (ret_val != 0)
+ return ret_val;
+
return netlbl_unlabel_genl_init();
}
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index e68ef9ccd..3cfd6cc60 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -8,20 +8,6 @@
#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long))
-struct netlink_ring {
- void **pg_vec;
- unsigned int head;
- unsigned int frames_per_block;
- unsigned int frame_size;
- unsigned int frame_max;
-
- unsigned int pg_vec_order;
- unsigned int pg_vec_pages;
- unsigned int pg_vec_len;
-
- atomic_t pending;
-};
-
struct netlink_sock {
/* struct sock has to be the first member of netlink_sock */
struct sock sk;
diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c
index dd9003f38..0fd5518bf 100644
--- a/net/nfc/digital_core.c
+++ b/net/nfc/digital_core.c
@@ -30,6 +30,9 @@
#define DIGITAL_PROTO_ISO15693_RF_TECH NFC_PROTO_ISO15693_MASK
+/* Delay between each poll frame (ms) */
+#define DIGITAL_POLL_INTERVAL 10
+
struct digital_cmd {
struct list_head queue;
@@ -173,6 +176,8 @@ static void digital_wq_cmd(struct work_struct *work)
return;
}
+ cmd->pending = 1;
+
mutex_unlock(&ddev->cmd_lock);
if (cmd->req)
@@ -419,7 +424,8 @@ void digital_poll_next_tech(struct nfc_digital_dev *ddev)
mutex_unlock(&ddev->poll_lock);
- schedule_work(&ddev->poll_work);
+ schedule_delayed_work(&ddev->poll_work,
+ msecs_to_jiffies(DIGITAL_POLL_INTERVAL));
}
static void digital_wq_poll(struct work_struct *work)
@@ -428,7 +434,7 @@ static void digital_wq_poll(struct work_struct *work)
struct digital_poll_tech *poll_tech;
struct nfc_digital_dev *ddev = container_of(work,
struct nfc_digital_dev,
- poll_work);
+ poll_work.work);
mutex_lock(&ddev->poll_lock);
if (!ddev->poll_tech_count) {
@@ -543,7 +549,7 @@ static int digital_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols,
return -EINVAL;
}
- schedule_work(&ddev->poll_work);
+ schedule_delayed_work(&ddev->poll_work, 0);
return 0;
}
@@ -564,7 +570,7 @@ static void digital_stop_poll(struct nfc_dev *nfc_dev)
mutex_unlock(&ddev->poll_lock);
- cancel_work_sync(&ddev->poll_work);
+ cancel_delayed_work_sync(&ddev->poll_work);
digital_abort_cmd(ddev);
}
@@ -606,6 +612,8 @@ static int digital_dep_link_down(struct nfc_dev *nfc_dev)
{
struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+ digital_abort_cmd(ddev);
+
ddev->curr_protocol = 0;
return 0;
@@ -770,7 +778,7 @@ struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops,
INIT_WORK(&ddev->cmd_complete_work, digital_wq_cmd_complete);
mutex_init(&ddev->poll_lock);
- INIT_WORK(&ddev->poll_work, digital_wq_poll);
+ INIT_DELAYED_WORK(&ddev->poll_work, digital_wq_poll);
if (supported_protocols & NFC_PROTO_JEWEL_MASK)
ddev->protocols |= NFC_PROTO_JEWEL_MASK;
@@ -832,12 +840,20 @@ void nfc_digital_unregister_device(struct nfc_digital_dev *ddev)
ddev->poll_tech_count = 0;
mutex_unlock(&ddev->poll_lock);
- cancel_work_sync(&ddev->poll_work);
+ cancel_delayed_work_sync(&ddev->poll_work);
cancel_work_sync(&ddev->cmd_work);
cancel_work_sync(&ddev->cmd_complete_work);
list_for_each_entry_safe(cmd, n, &ddev->cmd_queue, queue) {
list_del(&cmd->queue);
+
+ /* Call the command callback if any and pass it a ENODEV error.
+ * This gives a chance to the command issuer to free any
+ * allocated buffer.
+ */
+ if (cmd->cmd_cb)
+ cmd->cmd_cb(ddev, cmd->cb_context, ERR_PTR(-ENODEV));
+
kfree(cmd->mdaa_params);
kfree(cmd);
}
diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c
index f72be7433..f864ce19e 100644
--- a/net/nfc/digital_dep.c
+++ b/net/nfc/digital_dep.c
@@ -35,6 +35,8 @@
#define DIGITAL_ATR_REQ_MIN_SIZE 16
#define DIGITAL_ATR_REQ_MAX_SIZE 64
+#define DIGITAL_ATR_RES_TO_WT(s) ((s) & 0xF)
+
#define DIGITAL_DID_MAX 14
#define DIGITAL_PAYLOAD_SIZE_MAX 254
@@ -63,6 +65,9 @@
#define DIGITAL_NFC_DEP_DID_BIT_SET(pfb) ((pfb) & DIGITAL_NFC_DEP_PFB_DID_BIT)
#define DIGITAL_NFC_DEP_PFB_PNI(pfb) ((pfb) & 0x03)
+#define DIGITAL_NFC_DEP_RTOX_VALUE(data) ((data) & 0x3F)
+#define DIGITAL_NFC_DEP_RTOX_MAX 59
+
#define DIGITAL_NFC_DEP_PFB_I_PDU 0x00
#define DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU 0x40
#define DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU 0x80
@@ -122,6 +127,37 @@ static const u8 digital_payload_bits_map[4] = {
[3] = 254
};
+/* Response Waiting Time for ATR_RES PDU in ms
+ *
+ * RWT(ATR_RES) = RWT(nfcdep,activation) + dRWT(nfcdep) + dT(nfcdep,initiator)
+ *
+ * with:
+ * RWT(nfcdep,activation) = 4096 * 2^12 / f(c) s
+ * dRWT(nfcdep) = 16 / f(c) s
+ * dT(nfcdep,initiator) = 100 ms
+ * f(c) = 13560000 Hz
+ */
+#define DIGITAL_ATR_RES_RWT 1337
+
+/* Response Waiting Time for other DEP PDUs in ms
+ *
+ * max_rwt = rwt + dRWT(nfcdep) + dT(nfcdep,initiator)
+ *
+ * with:
+ * rwt = (256 * 16 / f(c)) * 2^wt s
+ * dRWT(nfcdep) = 16 / f(c) s
+ * dT(nfcdep,initiator) = 100 ms
+ * f(c) = 13560000 Hz
+ * 0 <= wt <= 14 (given by the target by the TO field of ATR_RES response)
+ */
+#define DIGITAL_NFC_DEP_IN_MAX_WT 14
+#define DIGITAL_NFC_DEP_TG_MAX_WT 8
+static const u16 digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT + 1] = {
+ 100, 101, 101, 102, 105,
+ 110, 119, 139, 177, 255,
+ 409, 719, 1337, 2575, 5049,
+};
+
static u8 digital_payload_bits_to_size(u8 payload_bits)
{
if (payload_bits >= ARRAY_SIZE(digital_payload_bits_map))
@@ -190,8 +226,6 @@ digital_send_dep_data_prep(struct nfc_digital_dev *ddev, struct sk_buff *skb,
return ERR_PTR(-ENOMEM);
}
- skb_reserve(new_skb, ddev->tx_headroom + NFC_HEADER_SIZE +
- DIGITAL_NFC_DEP_REQ_RES_HEADROOM);
memcpy(skb_put(new_skb, ddev->remote_payload_max), skb->data,
ddev->remote_payload_max);
skb_pull(skb, ddev->remote_payload_max);
@@ -368,8 +402,8 @@ static int digital_in_send_psl_req(struct nfc_digital_dev *ddev,
ddev->skb_add_crc(skb);
- rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_psl_res,
- target);
+ rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt,
+ digital_in_recv_psl_res, target);
if (rc)
kfree_skb(skb);
@@ -382,6 +416,7 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg,
struct nfc_target *target = arg;
struct digital_atr_res *atr_res;
u8 gb_len, payload_bits;
+ u8 wt;
int rc;
if (IS_ERR(resp)) {
@@ -411,6 +446,11 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg,
atr_res = (struct digital_atr_res *)resp->data;
+ wt = DIGITAL_ATR_RES_TO_WT(atr_res->to);
+ if (wt > DIGITAL_NFC_DEP_IN_MAX_WT)
+ wt = DIGITAL_NFC_DEP_IN_MAX_WT;
+ ddev->dep_rwt = digital_rwt_map[wt];
+
payload_bits = DIGITAL_PAYLOAD_PP_TO_BITS(atr_res->pp);
ddev->remote_payload_max = digital_payload_bits_to_size(payload_bits);
@@ -492,8 +532,8 @@ int digital_in_send_atr_req(struct nfc_digital_dev *ddev,
ddev->skb_add_crc(skb);
- rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_atr_res,
- target);
+ rc = digital_in_send_cmd(ddev, skb, DIGITAL_ATR_RES_RWT,
+ digital_in_recv_atr_res, target);
if (rc)
kfree_skb(skb);
@@ -524,11 +564,10 @@ static int digital_in_send_ack(struct nfc_digital_dev *ddev,
ddev->skb_add_crc(skb);
- ddev->saved_skb = skb_get(skb);
- ddev->saved_skb_len = skb->len;
+ ddev->saved_skb = pskb_copy(skb, GFP_KERNEL);
- rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
- data_exch);
+ rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt,
+ digital_in_recv_dep_res, data_exch);
if (rc) {
kfree_skb(skb);
kfree_skb(ddev->saved_skb);
@@ -562,8 +601,8 @@ static int digital_in_send_nack(struct nfc_digital_dev *ddev,
ddev->skb_add_crc(skb);
- rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
- data_exch);
+ rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt,
+ digital_in_recv_dep_res, data_exch);
if (rc)
kfree_skb(skb);
@@ -593,8 +632,8 @@ static int digital_in_send_atn(struct nfc_digital_dev *ddev,
ddev->skb_add_crc(skb);
- rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
- data_exch);
+ rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt,
+ digital_in_recv_dep_res, data_exch);
if (rc)
kfree_skb(skb);
@@ -607,6 +646,11 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev,
struct digital_dep_req_res *dep_req;
struct sk_buff *skb;
int rc;
+ u16 rwt_int;
+
+ rwt_int = ddev->dep_rwt * rtox;
+ if (rwt_int > digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT])
+ rwt_int = digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT];
skb = digital_skb_alloc(ddev, 1);
if (!skb)
@@ -627,16 +671,10 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev,
ddev->skb_add_crc(skb);
- ddev->saved_skb = skb_get(skb);
- ddev->saved_skb_len = skb->len;
-
- rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
- data_exch);
- if (rc) {
+ rc = digital_in_send_cmd(ddev, skb, rwt_int,
+ digital_in_recv_dep_res, data_exch);
+ if (rc)
kfree_skb(skb);
- kfree_skb(ddev->saved_skb);
- ddev->saved_skb = NULL;
- }
return rc;
}
@@ -644,11 +682,19 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev,
static int digital_in_send_saved_skb(struct nfc_digital_dev *ddev,
struct digital_data_exch *data_exch)
{
+ int rc;
+
+ if (!ddev->saved_skb)
+ return -EINVAL;
+
skb_get(ddev->saved_skb);
- skb_push(ddev->saved_skb, ddev->saved_skb_len);
- return digital_in_send_cmd(ddev, ddev->saved_skb, 1500,
- digital_in_recv_dep_res, data_exch);
+ rc = digital_in_send_cmd(ddev, ddev->saved_skb, ddev->dep_rwt,
+ digital_in_recv_dep_res, data_exch);
+ if (rc)
+ kfree_skb(ddev->saved_skb);
+
+ return rc;
}
static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg,
@@ -659,12 +705,13 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg,
u8 pfb;
uint size;
int rc;
+ u8 rtox;
if (IS_ERR(resp)) {
rc = PTR_ERR(resp);
resp = NULL;
- if (((rc != -ETIMEDOUT) || ddev->nack_count) &&
+ if ((rc == -EIO || (rc == -ETIMEDOUT && ddev->nack_count)) &&
(ddev->nack_count++ < DIGITAL_NFC_DEP_N_RETRY_NACK)) {
ddev->atn_count = 0;
@@ -783,6 +830,12 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg,
break;
case DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU:
+ if (DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) {
+ PROTOCOL_ERR("14.12.4.5");
+ rc = -EIO;
+ goto exit;
+ }
+
if (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni) {
PROTOCOL_ERR("14.12.3.3");
rc = -EIO;
@@ -792,43 +845,53 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg,
ddev->curr_nfc_dep_pni =
DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni + 1);
- if (ddev->chaining_skb && !DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) {
- kfree_skb(ddev->saved_skb);
- ddev->saved_skb = NULL;
+ if (!ddev->chaining_skb) {
+ PROTOCOL_ERR("14.12.4.3");
+ rc = -EIO;
+ goto exit;
+ }
- rc = digital_in_send_dep_req(ddev, NULL,
- ddev->chaining_skb,
- ddev->data_exch);
- if (rc)
- goto error;
+ /* The initiator has received a valid ACK. Free the last sent
+ * PDU and keep on sending chained skb.
+ */
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
- return;
- }
+ rc = digital_in_send_dep_req(ddev, NULL,
+ ddev->chaining_skb,
+ ddev->data_exch);
+ if (rc)
+ goto error;
- pr_err("Received a ACK/NACK PDU\n");
- rc = -EINVAL;
- goto exit;
+ goto free_resp;
case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU:
if (!DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) { /* ATN */
rc = digital_in_send_saved_skb(ddev, data_exch);
- if (rc) {
- kfree_skb(ddev->saved_skb);
+ if (rc)
goto error;
- }
- return;
+ goto free_resp;
}
- kfree_skb(ddev->saved_skb);
- ddev->saved_skb = NULL;
+ if (ddev->atn_count || ddev->nack_count) {
+ PROTOCOL_ERR("14.12.4.4");
+ rc = -EIO;
+ goto error;
+ }
+
+ rtox = DIGITAL_NFC_DEP_RTOX_VALUE(resp->data[0]);
+ if (!rtox || rtox > DIGITAL_NFC_DEP_RTOX_MAX) {
+ PROTOCOL_ERR("14.8.4.1");
+ rc = -EIO;
+ goto error;
+ }
- rc = digital_in_send_rtox(ddev, data_exch, resp->data[0]);
+ rc = digital_in_send_rtox(ddev, data_exch, rtox);
if (rc)
goto error;
- kfree_skb(resp);
- return;
+ goto free_resp;
}
exit:
@@ -845,6 +908,11 @@ error:
if (rc)
kfree_skb(resp);
+
+ return;
+
+free_resp:
+ dev_kfree_skb(resp);
}
int digital_in_send_dep_req(struct nfc_digital_dev *ddev,
@@ -876,11 +944,10 @@ int digital_in_send_dep_req(struct nfc_digital_dev *ddev,
ddev->skb_add_crc(tmp_skb);
- ddev->saved_skb = skb_get(tmp_skb);
- ddev->saved_skb_len = tmp_skb->len;
+ ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL);
- rc = digital_in_send_cmd(ddev, tmp_skb, 1500, digital_in_recv_dep_res,
- data_exch);
+ rc = digital_in_send_cmd(ddev, tmp_skb, ddev->dep_rwt,
+ digital_in_recv_dep_res, data_exch);
if (rc) {
if (tmp_skb != skb)
kfree_skb(tmp_skb);
@@ -956,8 +1023,7 @@ static int digital_tg_send_ack(struct nfc_digital_dev *ddev,
ddev->skb_add_crc(skb);
- ddev->saved_skb = skb_get(skb);
- ddev->saved_skb_len = skb->len;
+ ddev->saved_skb = pskb_copy(skb, GFP_KERNEL);
rc = digital_tg_send_cmd(ddev, skb, 1500, digital_tg_recv_dep_req,
data_exch);
@@ -1009,11 +1075,19 @@ static int digital_tg_send_atn(struct nfc_digital_dev *ddev)
static int digital_tg_send_saved_skb(struct nfc_digital_dev *ddev)
{
+ int rc;
+
+ if (!ddev->saved_skb)
+ return -EINVAL;
+
skb_get(ddev->saved_skb);
- skb_push(ddev->saved_skb, ddev->saved_skb_len);
- return digital_tg_send_cmd(ddev, ddev->saved_skb, 1500,
- digital_tg_recv_dep_req, NULL);
+ rc = digital_tg_send_cmd(ddev, ddev->saved_skb, 1500,
+ digital_tg_recv_dep_req, NULL);
+ if (rc)
+ kfree_skb(ddev->saved_skb);
+
+ return rc;
}
static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg,
@@ -1086,22 +1160,38 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg,
case DIGITAL_NFC_DEP_PFB_I_PDU:
pr_debug("DIGITAL_NFC_DEP_PFB_I_PDU\n");
- if ((ddev->atn_count && (DIGITAL_NFC_DEP_PFB_PNI(pfb - 1) !=
- ddev->curr_nfc_dep_pni)) ||
- (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni)) {
- PROTOCOL_ERR("14.12.3.4");
- rc = -EIO;
- goto exit;
- }
-
if (ddev->atn_count) {
+ /* The target has received (and replied to) at least one
+ * ATN DEP_REQ.
+ */
ddev->atn_count = 0;
- rc = digital_tg_send_saved_skb(ddev);
- if (rc)
- goto exit;
+ /* pni of resp PDU equal to the target current pni - 1
+ * means resp is the previous DEP_REQ PDU received from
+ * the initiator so the target replies with saved_skb
+ * which is the previous DEP_RES saved in
+ * digital_tg_send_dep_res().
+ */
+ if (DIGITAL_NFC_DEP_PFB_PNI(pfb) ==
+ DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni - 1)) {
+ rc = digital_tg_send_saved_skb(ddev);
+ if (rc)
+ goto exit;
- return;
+ goto free_resp;
+ }
+
+ /* atn_count > 0 and PDU pni != curr_nfc_dep_pni - 1
+ * means the target probably did not received the last
+ * DEP_REQ PDU sent by the initiator. The target
+ * fallbacks to normal processing then.
+ */
+ }
+
+ if (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni) {
+ PROTOCOL_ERR("14.12.3.4");
+ rc = -EIO;
+ goto exit;
}
kfree_skb(ddev->saved_skb);
@@ -1125,51 +1215,64 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg,
rc = 0;
break;
case DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU:
- if (!DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { /* ACK */
- if ((ddev->atn_count &&
- (DIGITAL_NFC_DEP_PFB_PNI(pfb - 1) !=
- ddev->curr_nfc_dep_pni)) ||
- (DIGITAL_NFC_DEP_PFB_PNI(pfb) !=
- ddev->curr_nfc_dep_pni) ||
- !ddev->chaining_skb || !ddev->saved_skb) {
+ if (DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { /* NACK */
+ if (DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) !=
+ ddev->curr_nfc_dep_pni) {
rc = -EIO;
goto exit;
}
- if (ddev->atn_count) {
- ddev->atn_count = 0;
+ ddev->atn_count = 0;
+ rc = digital_tg_send_saved_skb(ddev);
+ if (rc)
+ goto exit;
+
+ goto free_resp;
+ }
+
+ /* ACK */
+ if (ddev->atn_count) {
+ /* The target has previously recevied one or more ATN
+ * PDUs.
+ */
+ ddev->atn_count = 0;
+
+ /* If the ACK PNI is equal to the target PNI - 1 means
+ * that the initiator did not receive the previous PDU
+ * sent by the target so re-send it.
+ */
+ if (DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) ==
+ ddev->curr_nfc_dep_pni) {
rc = digital_tg_send_saved_skb(ddev);
if (rc)
goto exit;
- return;
+ goto free_resp;
}
- kfree_skb(ddev->saved_skb);
- ddev->saved_skb = NULL;
+ /* Otherwise, the target did not receive the previous
+ * ACK PDU from the initiator. Fallback to normal
+ * processing of chained PDU then.
+ */
+ }
- rc = digital_tg_send_dep_res(ddev, ddev->chaining_skb);
- if (rc)
- goto exit;
- } else { /* NACK */
- if ((DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) !=
- ddev->curr_nfc_dep_pni) ||
- !ddev->saved_skb) {
- rc = -EIO;
- goto exit;
- }
+ /* Keep on sending chained PDU */
+ if (!ddev->chaining_skb ||
+ DIGITAL_NFC_DEP_PFB_PNI(pfb) !=
+ ddev->curr_nfc_dep_pni) {
+ rc = -EIO;
+ goto exit;
+ }
- ddev->atn_count = 0;
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
- rc = digital_tg_send_saved_skb(ddev);
- if (rc) {
- kfree_skb(ddev->saved_skb);
- goto exit;
- }
- }
+ rc = digital_tg_send_dep_res(ddev, ddev->chaining_skb);
+ if (rc)
+ goto exit;
- return;
+ goto free_resp;
case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU:
if (DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) {
rc = -EINVAL;
@@ -1182,8 +1285,7 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg,
ddev->atn_count++;
- kfree_skb(resp);
- return;
+ goto free_resp;
}
rc = nfc_tm_data_received(ddev->nfc_dev, resp);
@@ -1199,6 +1301,11 @@ exit:
if (rc)
kfree_skb(resp);
+
+ return;
+
+free_resp:
+ dev_kfree_skb(resp);
}
int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb)
@@ -1235,8 +1342,7 @@ int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb)
ddev->skb_add_crc(tmp_skb);
- ddev->saved_skb = skb_get(tmp_skb);
- ddev->saved_skb_len = tmp_skb->len;
+ ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL);
rc = digital_tg_send_cmd(ddev, tmp_skb, 1500, digital_tg_recv_dep_req,
NULL);
@@ -1420,7 +1526,7 @@ static int digital_tg_send_atr_res(struct nfc_digital_dev *ddev,
atr_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN;
atr_res->cmd = DIGITAL_CMD_ATR_RES;
memcpy(atr_res->nfcid3, atr_req->nfcid3, sizeof(atr_req->nfcid3));
- atr_res->to = 8;
+ atr_res->to = DIGITAL_NFC_DEP_TG_MAX_WT;
ddev->local_payload_max = DIGITAL_PAYLOAD_SIZE_MAX;
payload_bits = digital_payload_size_to_bits(ddev->local_payload_max);
diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c
index fb58ed2dd..d9080dec5 100644
--- a/net/nfc/digital_technology.c
+++ b/net/nfc/digital_technology.c
@@ -1257,21 +1257,12 @@ static int digital_tg_config_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech)
int digital_tg_listen_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech)
{
int rc;
- u8 *nfcid2;
rc = digital_tg_config_nfcf(ddev, rf_tech);
if (rc)
return rc;
- nfcid2 = kzalloc(NFC_NFCID2_MAXSIZE, GFP_KERNEL);
- if (!nfcid2)
- return -ENOMEM;
-
- nfcid2[0] = DIGITAL_SENSF_NFCID2_NFC_DEP_B1;
- nfcid2[1] = DIGITAL_SENSF_NFCID2_NFC_DEP_B2;
- get_random_bytes(nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2);
-
- return digital_tg_listen(ddev, 300, digital_tg_recv_sensf_req, nfcid2);
+ return digital_tg_listen(ddev, 300, digital_tg_recv_sensf_req, NULL);
}
void digital_tg_recv_md_req(struct nfc_digital_dev *ddev, void *arg,
diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c
index 1399a03fa..3d699cbc7 100644
--- a/net/nfc/hci/llc.c
+++ b/net/nfc/hci/llc.c
@@ -133,36 +133,29 @@ void nfc_llc_free(struct nfc_llc *llc)
kfree(llc);
}
-inline void nfc_llc_get_rx_head_tail_room(struct nfc_llc *llc, int *rx_headroom,
- int *rx_tailroom)
-{
- *rx_headroom = llc->rx_headroom;
- *rx_tailroom = llc->rx_tailroom;
-}
-
-inline int nfc_llc_start(struct nfc_llc *llc)
+int nfc_llc_start(struct nfc_llc *llc)
{
return llc->ops->start(llc);
}
EXPORT_SYMBOL(nfc_llc_start);
-inline int nfc_llc_stop(struct nfc_llc *llc)
+int nfc_llc_stop(struct nfc_llc *llc)
{
return llc->ops->stop(llc);
}
EXPORT_SYMBOL(nfc_llc_stop);
-inline void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb)
+void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb)
{
llc->ops->rcv_from_drv(llc, skb);
}
-inline int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb)
+int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb)
{
return llc->ops->xmit_from_hci(llc, skb);
}
-inline void *nfc_llc_get_data(struct nfc_llc *llc)
+void *nfc_llc_get_data(struct nfc_llc *llc)
{
return llc->data;
}
diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
index 3425532c3..c5959ce50 100644
--- a/net/nfc/llcp_commands.c
+++ b/net/nfc/llcp_commands.c
@@ -438,19 +438,17 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock)
goto error_tlv;
}
- if (service_name_tlv != NULL)
- skb = llcp_add_tlv(skb, service_name_tlv,
- service_name_tlv_length);
-
- skb = llcp_add_tlv(skb, miux_tlv, miux_tlv_length);
- skb = llcp_add_tlv(skb, rw_tlv, rw_tlv_length);
+ llcp_add_tlv(skb, service_name_tlv, service_name_tlv_length);
+ llcp_add_tlv(skb, miux_tlv, miux_tlv_length);
+ llcp_add_tlv(skb, rw_tlv, rw_tlv_length);
skb_queue_tail(&local->tx_queue, skb);
- return 0;
+ err = 0;
error_tlv:
- pr_err("error %d\n", err);
+ if (err)
+ pr_err("error %d\n", err);
kfree(service_name_tlv);
kfree(miux_tlv);
@@ -493,15 +491,16 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock)
goto error_tlv;
}
- skb = llcp_add_tlv(skb, miux_tlv, miux_tlv_length);
- skb = llcp_add_tlv(skb, rw_tlv, rw_tlv_length);
+ llcp_add_tlv(skb, miux_tlv, miux_tlv_length);
+ llcp_add_tlv(skb, rw_tlv, rw_tlv_length);
skb_queue_tail(&local->tx_queue, skb);
- return 0;
+ err = 0;
error_tlv:
- pr_err("error %d\n", err);
+ if (err)
+ pr_err("error %d\n", err);
kfree(miux_tlv);
kfree(rw_tlv);
diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
index 98876274a..e69786c68 100644
--- a/net/nfc/llcp_core.c
+++ b/net/nfc/llcp_core.c
@@ -732,9 +732,8 @@ static void nfc_llcp_tx_work(struct work_struct *work)
int ret;
pr_debug("Sending pending skb\n");
- print_hex_dump(KERN_DEBUG, "LLCP Tx: ",
- DUMP_PREFIX_OFFSET, 16, 1,
- skb->data, skb->len, true);
+ print_hex_dump_debug("LLCP Tx: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->data, skb->len, true);
if (ptype == LLCP_PDU_DISC && sk != NULL &&
sk->sk_state == LLCP_DISCONNECTING) {
@@ -1412,8 +1411,8 @@ static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb)
pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap);
if (ptype != LLCP_PDU_SYMM)
- print_hex_dump(KERN_DEBUG, "LLCP Rx: ", DUMP_PREFIX_OFFSET,
- 16, 1, skb->data, skb->len, true);
+ print_hex_dump_debug("LLCP Rx: ", DUMP_PREFIX_OFFSET, 16, 1,
+ skb->data, skb->len, true);
switch (ptype) {
case LLCP_PDU_SYMM:
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 9a3eb7a0e..1ecbd7715 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -750,6 +750,14 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
if (likely(vport)) {
u16 mru = OVS_CB(skb)->mru;
+ u32 cutlen = OVS_CB(skb)->cutlen;
+
+ if (unlikely(cutlen > 0)) {
+ if (skb->len - cutlen > ETH_HLEN)
+ pskb_trim(skb, skb->len - cutlen);
+ else
+ pskb_trim(skb, ETH_HLEN);
+ }
if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
ovs_vport_send(vport, skb);
@@ -775,7 +783,8 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
static int output_userspace(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key, const struct nlattr *attr,
- const struct nlattr *actions, int actions_len)
+ const struct nlattr *actions, int actions_len,
+ uint32_t cutlen)
{
struct dp_upcall_info upcall;
const struct nlattr *a;
@@ -822,7 +831,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
} /* End of switch. */
}
- return ovs_dp_upcall(dp, skb, key, &upcall);
+ return ovs_dp_upcall(dp, skb, key, &upcall, cutlen);
}
static int sample(struct datapath *dp, struct sk_buff *skb,
@@ -832,6 +841,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
const struct nlattr *acts_list = NULL;
const struct nlattr *a;
int rem;
+ u32 cutlen = 0;
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
a = nla_next(a, &rem)) {
@@ -858,13 +868,24 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
return 0;
/* The only known usage of sample action is having a single user-space
+ * action, or having a truncate action followed by a single user-space
* action. Treat this usage as a special case.
* The output_userspace() should clone the skb to be sent to the
* user space. This skb will be consumed by its caller.
*/
+ if (unlikely(nla_type(a) == OVS_ACTION_ATTR_TRUNC)) {
+ struct ovs_action_trunc *trunc = nla_data(a);
+
+ if (skb->len > trunc->max_len)
+ cutlen = skb->len - trunc->max_len;
+
+ a = nla_next(a, &rem);
+ }
+
if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
nla_is_last(a, rem)))
- return output_userspace(dp, skb, key, a, actions, actions_len);
+ return output_userspace(dp, skb, key, a, actions,
+ actions_len, cutlen);
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb)
@@ -1051,6 +1072,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
if (out_skb)
do_output(dp, out_skb, prev_port, key);
+ OVS_CB(skb)->cutlen = 0;
prev_port = -1;
}
@@ -1059,8 +1081,18 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
prev_port = nla_get_u32(a);
break;
+ case OVS_ACTION_ATTR_TRUNC: {
+ struct ovs_action_trunc *trunc = nla_data(a);
+
+ if (skb->len > trunc->max_len)
+ OVS_CB(skb)->cutlen = skb->len - trunc->max_len;
+ break;
+ }
+
case OVS_ACTION_ATTR_USERSPACE:
- output_userspace(dp, skb, key, a, attr, len);
+ output_userspace(dp, skb, key, a, attr,
+ len, OVS_CB(skb)->cutlen);
+ OVS_CB(skb)->cutlen = 0;
break;
case OVS_ACTION_ATTR_HASH:
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index d84312584..e054a748f 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -135,7 +135,7 @@ static void ovs_ct_get_labels(const struct nf_conn *ct,
struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
if (cl) {
- size_t len = cl->words * sizeof(long);
+ size_t len = sizeof(cl->bits);
if (len > OVS_CT_LABELS_LEN)
len = OVS_CT_LABELS_LEN;
@@ -274,7 +274,7 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
nf_ct_labels_ext_add(ct);
cl = nf_ct_labels_find(ct);
}
- if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN)
+ if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN)
return -ENOSPC;
err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
@@ -433,7 +433,6 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
struct nf_conntrack_l4proto *l4proto;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
- enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
unsigned int dataoff;
u8 protonum;
@@ -458,13 +457,8 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
ct = nf_ct_tuplehash_to_ctrack(h);
- ctinfo = ovs_ct_get_info(h);
- if (ctinfo == IP_CT_NEW) {
- /* This should not happen. */
- WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct);
- }
skb->nfct = &ct->ct_general;
- skb->nfctinfo = ctinfo;
+ skb->nfctinfo = ovs_ct_get_info(h);
return ct;
}
@@ -834,6 +828,17 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
return 0;
}
+static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
+{
+ size_t i;
+
+ for (i = 0; i < sizeof(*labels); i++)
+ if (labels->ct_labels[i])
+ return true;
+
+ return false;
+}
+
/* Lookup connection and confirm if unconfirmed. */
static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
@@ -844,24 +849,32 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
err = __ovs_ct_lookup(net, key, info, skb);
if (err)
return err;
- /* This is a no-op if the connection has already been confirmed. */
+
+ /* Apply changes before confirming the connection so that the initial
+ * conntrack NEW netlink event carries the values given in the CT
+ * action.
+ */
+ if (info->mark.mask) {
+ err = ovs_ct_set_mark(skb, key, info->mark.value,
+ info->mark.mask);
+ if (err)
+ return err;
+ }
+ if (labels_nonzero(&info->labels.mask)) {
+ err = ovs_ct_set_labels(skb, key, &info->labels.value,
+ &info->labels.mask);
+ if (err)
+ return err;
+ }
+ /* This will take care of sending queued events even if the connection
+ * is already confirmed.
+ */
if (nf_conntrack_confirm(skb) != NF_ACCEPT)
return -EINVAL;
return 0;
}
-static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
-{
- size_t i;
-
- for (i = 0; i < sizeof(*labels); i++)
- if (labels->ct_labels[i])
- return true;
-
- return false;
-}
-
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
* value if 'skb' is freed.
*/
@@ -886,19 +899,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
err = ovs_ct_commit(net, key, info, skb);
else
err = ovs_ct_lookup(net, key, info, skb);
- if (err)
- goto err;
- if (info->mark.mask) {
- err = ovs_ct_set_mark(skb, key, info->mark.value,
- info->mark.mask);
- if (err)
- goto err;
- }
- if (labels_nonzero(&info->labels.mask))
- err = ovs_ct_set_labels(skb, key, &info->labels.value,
- &info->labels.mask);
-err:
skb_push(skb, nh_ofs);
if (err)
kfree_skb(skb);
@@ -1155,6 +1156,20 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
}
}
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (!info->commit && info->mark.mask) {
+ OVS_NLERR(log,
+ "Setting conntrack mark requires 'commit' flag.");
+ return -EINVAL;
+ }
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ if (!info->commit && labels_nonzero(&info->labels.mask)) {
+ OVS_NLERR(log,
+ "Setting conntrack labels requires 'commit' flag.");
+ return -EINVAL;
+ }
+#endif
if (rem > 0) {
OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem);
return -EINVAL;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 856bd8dba..524c0fd30 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -137,10 +137,12 @@ EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
const struct sw_flow_key *,
- const struct dp_upcall_info *);
+ const struct dp_upcall_info *,
+ uint32_t cutlen);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
const struct sw_flow_key *,
- const struct dp_upcall_info *);
+ const struct dp_upcall_info *,
+ uint32_t cutlen);
/* Must be called with rcu_read_lock. */
static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
@@ -275,7 +277,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
upcall.cmd = OVS_PACKET_CMD_MISS;
upcall.portid = ovs_vport_find_upcall_portid(p, skb);
upcall.mru = OVS_CB(skb)->mru;
- error = ovs_dp_upcall(dp, skb, key, &upcall);
+ error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
if (unlikely(error))
kfree_skb(skb);
else
@@ -300,7 +302,8 @@ out:
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
- const struct dp_upcall_info *upcall_info)
+ const struct dp_upcall_info *upcall_info,
+ uint32_t cutlen)
{
struct dp_stats_percpu *stats;
int err;
@@ -311,9 +314,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
}
if (!skb_is_gso(skb))
- err = queue_userspace_packet(dp, skb, key, upcall_info);
+ err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
else
- err = queue_gso_packets(dp, skb, key, upcall_info);
+ err = queue_gso_packets(dp, skb, key, upcall_info, cutlen);
if (err)
goto err;
@@ -331,7 +334,8 @@ err:
static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
- const struct dp_upcall_info *upcall_info)
+ const struct dp_upcall_info *upcall_info,
+ uint32_t cutlen)
{
unsigned short gso_type = skb_shinfo(skb)->gso_type;
struct sw_flow_key later_key;
@@ -360,7 +364,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
if (gso_type & SKB_GSO_UDP && skb != segs)
key = &later_key;
- err = queue_userspace_packet(dp, skb, key, upcall_info);
+ err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
if (err)
break;
@@ -383,7 +387,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
{
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
- + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */
+ + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
+ + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
/* OVS_PACKET_ATTR_USERDATA */
if (upcall_info->userdata)
@@ -416,7 +421,8 @@ static void pad_packet(struct datapath *dp, struct sk_buff *skb)
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
- const struct dp_upcall_info *upcall_info)
+ const struct dp_upcall_info *upcall_info,
+ uint32_t cutlen)
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
@@ -461,7 +467,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
else
hlen = skb->len;
- len = upcall_msg_size(upcall_info, hlen);
+ len = upcall_msg_size(upcall_info, hlen - cutlen);
user_skb = genlmsg_new(len, GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
@@ -509,15 +515,25 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
pad_packet(dp, user_skb);
}
+ /* Add OVS_PACKET_ATTR_LEN when packet is truncated */
+ if (cutlen > 0) {
+ if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN,
+ skb->len)) {
+ err = -ENOBUFS;
+ goto out;
+ }
+ pad_packet(dp, user_skb);
+ }
+
/* Only reserve room for attribute header, packet data is added
* in skb_zerocopy() */
if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
err = -ENOBUFS;
goto out;
}
- nla->nla_len = nla_attr_size(skb->len);
+ nla->nla_len = nla_attr_size(skb->len - cutlen);
- err = skb_zerocopy(user_skb, skb, skb->len, hlen);
+ err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen);
if (err)
goto out;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 427e39a04..ab85c1cae 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -100,11 +100,13 @@ struct datapath {
* @input_vport: The original vport packet came in on. This value is cached
* when a packet is received by OVS.
* @mru: The maximum received fragement size; 0 if the packet is not
+ * @cutlen: The number of bytes from the packet end to be removed.
* fragmented.
*/
struct ovs_skb_cb {
struct vport *input_vport;
u16 mru;
+ u32 cutlen;
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -194,7 +196,8 @@ extern struct genl_family dp_vport_genl_family;
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key);
void ovs_dp_detach_port(struct vport *);
int ovs_dp_upcall(struct datapath *, struct sk_buff *,
- const struct sw_flow_key *, const struct dp_upcall_info *);
+ const struct sw_flow_key *, const struct dp_upcall_info *,
+ uint32_t cutlen);
const char *ovs_dp_name(const struct datapath *dp);
struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 0bb650f4f..c78a6a147 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2229,6 +2229,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
[OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
[OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
[OVS_ACTION_ATTR_CT] = (u32)-1,
+ [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -2255,6 +2256,14 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
return -EINVAL;
break;
+ case OVS_ACTION_ATTR_TRUNC: {
+ const struct ovs_action_trunc *trunc = nla_data(a);
+
+ if (trunc->max_len < ETH_HLEN)
+ return -EINVAL;
+ break;
+ }
+
case OVS_ACTION_ATTR_HASH: {
const struct ovs_action_hash *act_hash = nla_data(a);
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 1a1fcec88..5aaf3babf 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -93,7 +93,14 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
return ERR_CAST(dev);
}
- dev_change_flags(dev, dev->flags | IFF_UP);
+ err = dev_change_flags(dev, dev->flags | IFF_UP);
+ if (err < 0) {
+ rtnl_delete_link(dev);
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ goto error;
+ }
+
rtnl_unlock();
return vport;
error:
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 7f8897f33..0e72d95b0 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -54,6 +54,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms)
struct net *net = ovs_dp_get_net(parms->dp);
struct net_device *dev;
struct vport *vport;
+ int err;
vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms);
if (IS_ERR(vport))
@@ -67,9 +68,15 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms)
return ERR_CAST(dev);
}
- dev_change_flags(dev, dev->flags | IFF_UP);
- rtnl_unlock();
+ err = dev_change_flags(dev, dev->flags | IFF_UP);
+ if (err < 0) {
+ rtnl_delete_link(dev);
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ return ERR_PTR(err);
+ }
+ rtnl_unlock();
return vport;
}
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 2ee48e447..95c36147a 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -140,7 +140,7 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
{
- dev->needed_headroom = new_hr;
+ dev->needed_headroom = new_hr < 0 ? 0 : new_hr;
}
static const struct net_device_ops internal_dev_netdev_ops = {
@@ -195,7 +195,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
}
vport->dev = alloc_netdev(sizeof(struct internal_dev),
- parms->name, NET_NAME_UNKNOWN, do_setup);
+ parms->name, NET_NAME_USER, do_setup);
if (!vport->dev) {
err = -ENOMEM;
goto error_free_vport;
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 5eb769434..7eb955e45 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -130,7 +130,14 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
return ERR_CAST(dev);
}
- dev_change_flags(dev, dev->flags | IFF_UP);
+ err = dev_change_flags(dev, dev->flags | IFF_UP);
+ if (err < 0) {
+ rtnl_delete_link(dev);
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ goto error;
+ }
+
rtnl_unlock();
return vport;
error:
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 31cbc8c5c..6b21fd068 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -444,6 +444,7 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
OVS_CB(skb)->input_vport = vport;
OVS_CB(skb)->mru = 0;
+ OVS_CB(skb)->cutlen = 0;
if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) {
u32 mark;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b43c4015b..33a4697d5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1588,13 +1588,9 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
if (copy_from_user(&fd, data, len))
return -EFAULT;
- new = bpf_prog_get(fd);
+ new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
if (IS_ERR(new))
return PTR_ERR(new);
- if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
- bpf_prog_put(new);
- return -EINVAL;
- }
__fanout_set_data_bpf(po->fanout, new);
return 0;
@@ -1977,40 +1973,8 @@ static int __packet_rcv_vnet(const struct sk_buff *skb,
{
*vnet_hdr = (const struct virtio_net_hdr) { 0 };
- if (skb_is_gso(skb)) {
- struct skb_shared_info *sinfo = skb_shinfo(skb);
-
- /* This is a hint as to how much should be linear. */
- vnet_hdr->hdr_len =
- __cpu_to_virtio16(vio_le(), skb_headlen(skb));
- vnet_hdr->gso_size =
- __cpu_to_virtio16(vio_le(), sinfo->gso_size);
-
- if (sinfo->gso_type & SKB_GSO_TCPV4)
- vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
- else if (sinfo->gso_type & SKB_GSO_TCPV6)
- vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
- else if (sinfo->gso_type & SKB_GSO_UDP)
- vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
- else if (sinfo->gso_type & SKB_GSO_FCOE)
- return -EINVAL;
- else
- BUG();
-
- if (sinfo->gso_type & SKB_GSO_TCP_ECN)
- vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
- } else
- vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL) {
- vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
- vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(),
- skb_checksum_start_offset(skb));
- vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(),
- skb->csum_offset);
- } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
- vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
- } /* else everything is zero */
+ if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le()))
+ BUG();
return 0;
}
diff --git a/net/rds/bind.c b/net/rds/bind.c
index b22ea9565..095f6ce58 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -81,6 +81,8 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
if (*port != 0) {
rover = be16_to_cpu(*port);
+ if (rover == RDS_FLAG_PROBE_PORT)
+ return -EINVAL;
last = rover;
} else {
rover = max_t(u16, prandom_u32(), 2);
@@ -91,12 +93,16 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
if (rover == 0)
rover++;
+ if (rover == RDS_FLAG_PROBE_PORT)
+ continue;
key = ((u64)addr << 32) | cpu_to_be16(rover);
if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms))
continue;
rs->rs_bound_key = key;
rs->rs_bound_addr = addr;
+ net_get_random_once(&rs->rs_hash_initval,
+ sizeof(rs->rs_hash_initval));
rs->rs_bound_port = cpu_to_be16(rover);
rs->rs_bound_node.next = NULL;
rds_sock_addref(rs);
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 6641bcf7c..8398fee7c 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -235,7 +235,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
* therefore trigger warnings.
* Defer the xmit to rds_send_worker() instead.
*/
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(rds_wq,
+ &conn->c_path[0].cp_send_w, 0);
}
}
diff --git a/net/rds/connection.c b/net/rds/connection.c
index e3b118cae..f5058559b 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -95,14 +95,16 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
* and receiving over this connection again in the future. It is up to
* the transport to have serialized this call with its send and recv.
*/
-static void rds_conn_reset(struct rds_connection *conn)
+static void rds_conn_path_reset(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
+
rdsdebug("connection %pI4 to %pI4 reset\n",
&conn->c_laddr, &conn->c_faddr);
rds_stats_inc(s_conn_reset);
- rds_send_reset(conn);
- conn->c_flags = 0;
+ rds_send_path_reset(cp);
+ cp->cp_flags = 0;
/* Do not clear next_rx_seq here, else we cannot distinguish
* retransmitted packets from new packets, and will hand all
@@ -110,6 +112,32 @@ static void rds_conn_reset(struct rds_connection *conn)
* reliability guarantees of RDS. */
}
+static void __rds_conn_path_init(struct rds_connection *conn,
+ struct rds_conn_path *cp, bool is_outgoing)
+{
+ spin_lock_init(&cp->cp_lock);
+ cp->cp_next_tx_seq = 1;
+ init_waitqueue_head(&cp->cp_waitq);
+ INIT_LIST_HEAD(&cp->cp_send_queue);
+ INIT_LIST_HEAD(&cp->cp_retrans);
+
+ cp->cp_conn = conn;
+ atomic_set(&cp->cp_state, RDS_CONN_DOWN);
+ cp->cp_send_gen = 0;
+ /* cp_outgoing is per-path. So we can only set it here
+ * for the single-path transports.
+ */
+ if (!conn->c_trans->t_mp_capable)
+ cp->cp_outgoing = (is_outgoing ? 1 : 0);
+ cp->cp_reconnect_jiffies = 0;
+ INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker);
+ INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker);
+ INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker);
+ INIT_WORK(&cp->cp_down_w, rds_shutdown_worker);
+ mutex_init(&cp->cp_cm_lock);
+ cp->cp_flags = 0;
+}
+
/*
* There is only every one 'conn' for a given pair of addresses in the
* system at a time. They contain messages to be retransmitted and so
@@ -127,7 +155,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
struct rds_transport *loop_trans;
unsigned long flags;
- int ret;
+ int ret, i;
rcu_read_lock();
conn = rds_conn_lookup(net, head, laddr, faddr, trans);
@@ -153,13 +181,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
INIT_HLIST_NODE(&conn->c_hash_node);
conn->c_laddr = laddr;
conn->c_faddr = faddr;
- spin_lock_init(&conn->c_lock);
- conn->c_next_tx_seq = 1;
- rds_conn_net_set(conn, net);
- init_waitqueue_head(&conn->c_waitq);
- INIT_LIST_HEAD(&conn->c_send_queue);
- INIT_LIST_HEAD(&conn->c_retrans);
+ rds_conn_net_set(conn, net);
ret = rds_cong_get_maps(conn);
if (ret) {
@@ -188,6 +211,12 @@ static struct rds_connection *__rds_conn_create(struct net *net,
conn->c_trans = trans;
+ init_waitqueue_head(&conn->c_hs_waitq);
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ __rds_conn_path_init(conn, &conn->c_path[i],
+ is_outgoing);
+ conn->c_path[i].cp_index = i;
+ }
ret = trans->conn_alloc(conn, gfp);
if (ret) {
kmem_cache_free(rds_conn_slab, conn);
@@ -195,17 +224,6 @@ static struct rds_connection *__rds_conn_create(struct net *net,
goto out;
}
- atomic_set(&conn->c_state, RDS_CONN_DOWN);
- conn->c_send_gen = 0;
- conn->c_outgoing = (is_outgoing ? 1 : 0);
- conn->c_reconnect_jiffies = 0;
- INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
- INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
- INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
- INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
- mutex_init(&conn->c_cm_lock);
- conn->c_flags = 0;
-
rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
conn, &laddr, &faddr,
trans->t_name ? trans->t_name : "[unknown]",
@@ -222,7 +240,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
if (parent) {
/* Creating passive conn */
if (parent->c_passive) {
- trans->conn_free(conn->c_transport_data);
+ trans->conn_free(conn->c_path[0].cp_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = parent->c_passive;
} else {
@@ -236,7 +254,18 @@ static struct rds_connection *__rds_conn_create(struct net *net,
found = rds_conn_lookup(net, head, laddr, faddr, trans);
if (found) {
- trans->conn_free(conn->c_transport_data);
+ struct rds_conn_path *cp;
+ int i;
+
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ cp = &conn->c_path[i];
+ /* The ->conn_alloc invocation may have
+ * allocated resource for all paths, so all
+ * of them may have to be freed here.
+ */
+ if (cp->cp_transport_data)
+ trans->conn_free(cp->cp_transport_data);
+ }
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
@@ -267,10 +296,12 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net,
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
-void rds_conn_shutdown(struct rds_connection *conn)
+void rds_conn_shutdown(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
+
/* shut it down unless it's down already */
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+ if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
/*
* Quiesce the connection mgmt handlers before we start tearing
* things down. We don't hold the mutex for the entire
@@ -278,35 +309,38 @@ void rds_conn_shutdown(struct rds_connection *conn)
* deadlocking with the CM handler. Instead, the CM event
* handler is supposed to check for state DISCONNECTING
*/
- mutex_lock(&conn->c_cm_lock);
- if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
- && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
- rds_conn_error(conn, "shutdown called in state %d\n",
- atomic_read(&conn->c_state));
- mutex_unlock(&conn->c_cm_lock);
+ mutex_lock(&cp->cp_cm_lock);
+ if (!rds_conn_path_transition(cp, RDS_CONN_UP,
+ RDS_CONN_DISCONNECTING) &&
+ !rds_conn_path_transition(cp, RDS_CONN_ERROR,
+ RDS_CONN_DISCONNECTING)) {
+ rds_conn_path_error(cp,
+ "shutdown called in state %d\n",
+ atomic_read(&cp->cp_state));
+ mutex_unlock(&cp->cp_cm_lock);
return;
}
- mutex_unlock(&conn->c_cm_lock);
+ mutex_unlock(&cp->cp_cm_lock);
- wait_event(conn->c_waitq,
- !test_bit(RDS_IN_XMIT, &conn->c_flags));
- wait_event(conn->c_waitq,
- !test_bit(RDS_RECV_REFILL, &conn->c_flags));
+ wait_event(cp->cp_waitq,
+ !test_bit(RDS_IN_XMIT, &cp->cp_flags));
+ wait_event(cp->cp_waitq,
+ !test_bit(RDS_RECV_REFILL, &cp->cp_flags));
- conn->c_trans->conn_shutdown(conn);
- rds_conn_reset(conn);
+ conn->c_trans->conn_path_shutdown(cp);
+ rds_conn_path_reset(cp);
- if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+ if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING,
+ RDS_CONN_DOWN)) {
/* This can happen - eg when we're in the middle of tearing
* down the connection, and someone unloads the rds module.
* Quite reproduceable with loopback connections.
* Mostly harmless.
*/
- rds_conn_error(conn,
- "%s: failed to transition to state DOWN, "
- "current state is %d\n",
- __func__,
- atomic_read(&conn->c_state));
+ rds_conn_path_error(cp, "%s: failed to transition "
+ "to state DOWN, current state "
+ "is %d\n", __func__,
+ atomic_read(&cp->cp_state));
return;
}
}
@@ -315,18 +349,47 @@ void rds_conn_shutdown(struct rds_connection *conn)
* The passive side of an IB loopback connection is never added
* to the conn hash, so we never trigger a reconnect on this
* conn - the reconnect is always triggered by the active peer. */
- cancel_delayed_work_sync(&conn->c_conn_w);
+ cancel_delayed_work_sync(&cp->cp_conn_w);
rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock();
- if (conn->c_trans->t_type != RDS_TRANS_TCP ||
- conn->c_outgoing == 1)
- rds_queue_reconnect(conn);
+ rds_queue_reconnect(cp);
} else {
rcu_read_unlock();
}
}
+/* destroy a single rds_conn_path. rds_conn_destroy() iterates over
+ * all paths using rds_conn_path_destroy()
+ */
+static void rds_conn_path_destroy(struct rds_conn_path *cp)
+{
+ struct rds_message *rm, *rtmp;
+
+ if (!cp->cp_transport_data)
+ return;
+
+ rds_conn_path_drop(cp);
+ flush_work(&cp->cp_down_w);
+
+ /* make sure lingering queued work won't try to ref the conn */
+ cancel_delayed_work_sync(&cp->cp_send_w);
+ cancel_delayed_work_sync(&cp->cp_recv_w);
+
+ /* tear down queued messages */
+ list_for_each_entry_safe(rm, rtmp,
+ &cp->cp_send_queue,
+ m_conn_item) {
+ list_del_init(&rm->m_conn_item);
+ BUG_ON(!list_empty(&rm->m_sock_item));
+ rds_message_put(rm);
+ }
+ if (cp->cp_xmit_rm)
+ rds_message_put(cp->cp_xmit_rm);
+
+ cp->cp_conn->c_trans->conn_free(cp->cp_transport_data);
+}
+
/*
* Stop and free a connection.
*
@@ -336,8 +399,9 @@ void rds_conn_shutdown(struct rds_connection *conn)
*/
void rds_conn_destroy(struct rds_connection *conn)
{
- struct rds_message *rm, *rtmp;
unsigned long flags;
+ int i;
+ struct rds_conn_path *cp;
rdsdebug("freeing conn %p for %pI4 -> "
"%pI4\n", conn, &conn->c_laddr,
@@ -350,25 +414,11 @@ void rds_conn_destroy(struct rds_connection *conn)
synchronize_rcu();
/* shut the connection down */
- rds_conn_drop(conn);
- flush_work(&conn->c_down_w);
-
- /* make sure lingering queued work won't try to ref the conn */
- cancel_delayed_work_sync(&conn->c_send_w);
- cancel_delayed_work_sync(&conn->c_recv_w);
-
- /* tear down queued messages */
- list_for_each_entry_safe(rm, rtmp,
- &conn->c_send_queue,
- m_conn_item) {
- list_del_init(&rm->m_conn_item);
- BUG_ON(!list_empty(&rm->m_sock_item));
- rds_message_put(rm);
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ cp = &conn->c_path[i];
+ rds_conn_path_destroy(cp);
+ BUG_ON(!list_empty(&cp->cp_retrans));
}
- if (conn->c_xmit_rm)
- rds_message_put(conn->c_xmit_rm);
-
- conn->c_trans->conn_free(conn->c_transport_data);
/*
* The congestion maps aren't freed up here. They're
@@ -377,7 +427,6 @@ void rds_conn_destroy(struct rds_connection *conn)
*/
rds_cong_remove_conn(conn);
- BUG_ON(!list_empty(&conn->c_retrans));
kmem_cache_free(rds_conn_slab, conn);
spin_lock_irqsave(&rds_conn_lock, flags);
@@ -398,6 +447,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
unsigned int total = 0;
unsigned long flags;
size_t i;
+ int j;
len /= sizeof(struct rds_info_message);
@@ -406,23 +456,32 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
- if (want_send)
- list = &conn->c_send_queue;
- else
- list = &conn->c_retrans;
-
- spin_lock_irqsave(&conn->c_lock, flags);
-
- /* XXX too lazy to maintain counts.. */
- list_for_each_entry(rm, list, m_conn_item) {
- total++;
- if (total <= len)
- rds_inc_info_copy(&rm->m_inc, iter,
- conn->c_laddr,
- conn->c_faddr, 0);
+ struct rds_conn_path *cp;
+
+ for (j = 0; j < RDS_MPATH_WORKERS; j++) {
+ cp = &conn->c_path[j];
+ if (want_send)
+ list = &cp->cp_send_queue;
+ else
+ list = &cp->cp_retrans;
+
+ spin_lock_irqsave(&cp->cp_lock, flags);
+
+ /* XXX too lazy to maintain counts.. */
+ list_for_each_entry(rm, list, m_conn_item) {
+ total++;
+ if (total <= len)
+ rds_inc_info_copy(&rm->m_inc,
+ iter,
+ conn->c_laddr,
+ conn->c_faddr,
+ 0);
+ }
+
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+ if (!conn->c_trans->t_mp_capable)
+ break;
}
-
- spin_unlock_irqrestore(&conn->c_lock, flags);
}
}
rcu_read_unlock();
@@ -484,27 +543,72 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
}
EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
-static int rds_conn_info_visitor(struct rds_connection *conn,
- void *buffer)
+void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_conn_path *, void *),
+ size_t item_len)
+{
+ u64 buffer[(item_len + 7) / 8];
+ struct hlist_head *head;
+ struct rds_connection *conn;
+ size_t i;
+ int j;
+
+ rcu_read_lock();
+
+ lens->nr = 0;
+ lens->each = item_len;
+
+ for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+ i++, head++) {
+ hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ struct rds_conn_path *cp;
+
+ for (j = 0; j < RDS_MPATH_WORKERS; j++) {
+ cp = &conn->c_path[j];
+
+ /* XXX no cp_lock usage.. */
+ if (!visitor(cp, buffer))
+ continue;
+ if (!conn->c_trans->t_mp_capable)
+ break;
+ }
+
+ /* We copy as much as we can fit in the buffer,
+ * but we count all items so that the caller
+ * can resize the buffer.
+ */
+ if (len >= item_len) {
+ rds_info_copy(iter, buffer, item_len);
+ len -= item_len;
+ }
+ lens->nr++;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
{
struct rds_info_connection *cinfo = buffer;
- cinfo->next_tx_seq = conn->c_next_tx_seq;
- cinfo->next_rx_seq = conn->c_next_rx_seq;
- cinfo->laddr = conn->c_laddr;
- cinfo->faddr = conn->c_faddr;
- strncpy(cinfo->transport, conn->c_trans->t_name,
+ cinfo->next_tx_seq = cp->cp_next_tx_seq;
+ cinfo->next_rx_seq = cp->cp_next_rx_seq;
+ cinfo->laddr = cp->cp_conn->c_laddr;
+ cinfo->faddr = cp->cp_conn->c_faddr;
+ strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name,
sizeof(cinfo->transport));
cinfo->flags = 0;
- rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+ rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
SENDING);
/* XXX Future: return the state rather than these funky bits */
rds_conn_info_set(cinfo->flags,
- atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
+ atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
CONNECTING);
rds_conn_info_set(cinfo->flags,
- atomic_read(&conn->c_state) == RDS_CONN_UP,
+ atomic_read(&cp->cp_state) == RDS_CONN_UP,
CONNECTED);
return 1;
}
@@ -513,7 +617,7 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
- rds_for_each_conn_info(sock, len, iter, lens,
+ rds_walk_conn_path_info(sock, len, iter, lens,
rds_conn_info_visitor,
sizeof(struct rds_info_connection));
}
@@ -553,10 +657,17 @@ void rds_conn_exit(void)
/*
* Force a disconnect
*/
+void rds_conn_path_drop(struct rds_conn_path *cp)
+{
+ atomic_set(&cp->cp_state, RDS_CONN_ERROR);
+ queue_work(rds_wq, &cp->cp_down_w);
+}
+EXPORT_SYMBOL_GPL(rds_conn_path_drop);
+
void rds_conn_drop(struct rds_connection *conn)
{
- atomic_set(&conn->c_state, RDS_CONN_ERROR);
- queue_work(rds_wq, &conn->c_down_w);
+ WARN_ON(conn->c_trans->t_mp_capable);
+ rds_conn_path_drop(&conn->c_path[0]);
}
EXPORT_SYMBOL_GPL(rds_conn_drop);
@@ -564,11 +675,17 @@ EXPORT_SYMBOL_GPL(rds_conn_drop);
* If the connection is down, trigger a connect. We may have scheduled a
* delayed reconnect however - in this case we should not interfere.
*/
+void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
+{
+ if (rds_conn_path_state(cp) == RDS_CONN_DOWN &&
+ !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
+ queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+}
+
void rds_conn_connect_if_down(struct rds_connection *conn)
{
- if (rds_conn_state(conn) == RDS_CONN_DOWN &&
- !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+ WARN_ON(conn->c_trans->t_mp_capable);
+ rds_conn_path_connect_if_down(&conn->c_path[0]);
}
EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
@@ -586,3 +703,15 @@ __rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
rds_conn_drop(conn);
}
+
+void
+__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+
+ rds_conn_path_drop(cp);
+}
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b5342fdda..7eaf887e4 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -40,6 +40,7 @@
#include <linux/slab.h>
#include <linux/module.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"
@@ -380,15 +381,15 @@ void rds_ib_exit(void)
struct rds_transport rds_ib_transport = {
.laddr_check = rds_ib_laddr_check,
- .xmit_complete = rds_ib_xmit_complete,
+ .xmit_path_complete = rds_ib_xmit_path_complete,
.xmit = rds_ib_xmit,
.xmit_rdma = rds_ib_xmit_rdma,
.xmit_atomic = rds_ib_xmit_atomic,
- .recv = rds_ib_recv,
+ .recv_path = rds_ib_recv_path,
.conn_alloc = rds_ib_conn_alloc,
.conn_free = rds_ib_conn_free,
- .conn_connect = rds_ib_conn_connect,
- .conn_shutdown = rds_ib_conn_shutdown,
+ .conn_path_connect = rds_ib_conn_path_connect,
+ .conn_path_shutdown = rds_ib_conn_path_shutdown,
.inc_copy_to_user = rds_ib_inc_copy_to_user,
.inc_free = rds_ib_inc_free,
.cm_initiate_connect = rds_ib_cm_initiate_connect,
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 627fb79ae..046f7508c 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -328,8 +328,8 @@ extern struct list_head ib_nodev_conns;
/* ib_cm.c */
int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
void rds_ib_conn_free(void *arg);
-int rds_ib_conn_connect(struct rds_connection *conn);
-void rds_ib_conn_shutdown(struct rds_connection *conn);
+int rds_ib_conn_path_connect(struct rds_conn_path *cp);
+void rds_ib_conn_path_shutdown(struct rds_conn_path *cp);
void rds_ib_state_change(struct sock *sk);
int rds_ib_listen_init(void);
void rds_ib_listen_stop(void);
@@ -354,7 +354,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
/* ib_recv.c */
int rds_ib_recv_init(void);
void rds_ib_recv_exit(void);
-int rds_ib_recv(struct rds_connection *conn);
+int rds_ib_recv_path(struct rds_conn_path *conn);
int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
@@ -384,7 +384,7 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
extern wait_queue_head_t rds_ib_ring_empty_wait;
/* ib_send.c */
-void rds_ib_xmit_complete(struct rds_connection *conn);
+void rds_ib_xmit_path_complete(struct rds_conn_path *cp);
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 7c2a65a6a..5b2ab95af 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -36,6 +36,7 @@
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
@@ -273,7 +274,7 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
if (rds_conn_up(conn) &&
(!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
test_bit(0, &conn->c_map_queued)))
- rds_send_xmit(ic->conn);
+ rds_send_xmit(&ic->conn->c_path[0]);
}
static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
@@ -684,8 +685,9 @@ out:
return ret;
}
-int rds_ib_conn_connect(struct rds_connection *conn)
+int rds_ib_conn_path_connect(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
struct sockaddr_in src, dest;
int ret;
@@ -730,8 +732,9 @@ out:
* so that it can be called at any point during startup. In fact it
* can be called multiple times for a given connection.
*/
-void rds_ib_conn_shutdown(struct rds_connection *conn)
+void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
int err = 0;
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index f7164ac1f..977f69886 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -35,6 +35,7 @@
#include <linux/rculist.h>
#include <linux/llist.h>
+#include "rds_single_path.h"
#include "ib_mr.h"
struct workqueue_struct *rds_ib_mr_wq;
@@ -618,7 +619,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
int rds_ib_mr_init(void)
{
- rds_ib_mr_wq = create_workqueue("rds_mr_flushd");
+ rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0);
if (!rds_ib_mr_wq)
return -ENOMEM;
return 0;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index abc8cc805..606a11f68 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -36,6 +36,7 @@
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
@@ -1008,8 +1009,9 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
}
-int rds_ib_recv(struct rds_connection *conn)
+int rds_ib_recv_path(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
int ret = 0;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index f27d2c82b..84d90c973 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -36,6 +36,7 @@
#include <linux/dmapool.h>
#include <linux/ratelimit.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
@@ -979,8 +980,9 @@ out:
return ret;
}
-void rds_ib_xmit_complete(struct rds_connection *conn)
+void rds_ib_xmit_path_complete(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
/* We may have a pending ACK or window update we were unable
diff --git a/net/rds/loop.c b/net/rds/loop.c
index 814173b46..f2bf78de5 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -34,6 +34,7 @@
#include <linux/slab.h>
#include <linux/in.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "loop.h"
@@ -101,7 +102,7 @@ static void rds_loop_inc_free(struct rds_incoming *inc)
}
/* we need to at least give the thread something to succeed */
-static int rds_loop_recv(struct rds_connection *conn)
+static int rds_loop_recv_path(struct rds_conn_path *cp)
{
return 0;
}
@@ -149,13 +150,13 @@ static void rds_loop_conn_free(void *arg)
kfree(lc);
}
-static int rds_loop_conn_connect(struct rds_connection *conn)
+static int rds_loop_conn_path_connect(struct rds_conn_path *cp)
{
- rds_connect_complete(conn);
+ rds_connect_complete(cp->cp_conn);
return 0;
}
-static void rds_loop_conn_shutdown(struct rds_connection *conn)
+static void rds_loop_conn_path_shutdown(struct rds_conn_path *cp)
{
}
@@ -184,11 +185,11 @@ void rds_loop_exit(void)
*/
struct rds_transport rds_loop_transport = {
.xmit = rds_loop_xmit,
- .recv = rds_loop_recv,
+ .recv_path = rds_loop_recv_path,
.conn_alloc = rds_loop_conn_alloc,
.conn_free = rds_loop_conn_free,
- .conn_connect = rds_loop_conn_connect,
- .conn_shutdown = rds_loop_conn_shutdown,
+ .conn_path_connect = rds_loop_conn_path_connect,
+ .conn_path_shutdown = rds_loop_conn_path_shutdown,
.inc_copy_to_user = rds_message_inc_copy_to_user,
.inc_free = rds_loop_inc_free,
.t_name = "loopback",
diff --git a/net/rds/message.c b/net/rds/message.c
index 756c73729..6cb910615 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -41,6 +41,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+[RDS_EXTHDR_NPATHS] = sizeof(u16),
};
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 7220bebcf..345f09059 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -33,6 +33,7 @@
#include <linux/module.h>
#include <rdma/rdma_cm.h>
+#include "rds_single_path.h"
#include "rdma_transport.h"
#include "ib.h"
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 387df5f32..b2d17f0fa 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -84,56 +84,73 @@ enum {
#define RDS_IN_XMIT 2
#define RDS_RECV_REFILL 3
+/* Max number of multipaths per RDS connection. Must be a power of 2 */
+#define RDS_MPATH_WORKERS 8
+#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \
+ (rs)->rs_hash_initval) & ((n) - 1))
+
+/* Per mpath connection state */
+struct rds_conn_path {
+ struct rds_connection *cp_conn;
+ struct rds_message *cp_xmit_rm;
+ unsigned long cp_xmit_sg;
+ unsigned int cp_xmit_hdr_off;
+ unsigned int cp_xmit_data_off;
+ unsigned int cp_xmit_atomic_sent;
+ unsigned int cp_xmit_rdma_sent;
+ unsigned int cp_xmit_data_sent;
+
+ spinlock_t cp_lock; /* protect msg queues */
+ u64 cp_next_tx_seq;
+ struct list_head cp_send_queue;
+ struct list_head cp_retrans;
+
+ u64 cp_next_rx_seq;
+
+ void *cp_transport_data;
+
+ atomic_t cp_state;
+ unsigned long cp_send_gen;
+ unsigned long cp_flags;
+ unsigned long cp_reconnect_jiffies;
+ struct delayed_work cp_send_w;
+ struct delayed_work cp_recv_w;
+ struct delayed_work cp_conn_w;
+ struct work_struct cp_down_w;
+ struct mutex cp_cm_lock; /* protect cp_state & cm */
+ wait_queue_head_t cp_waitq;
+
+ unsigned int cp_unacked_packets;
+ unsigned int cp_unacked_bytes;
+ unsigned int cp_outgoing:1,
+ cp_pad_to_32:31;
+ unsigned int cp_index;
+};
+
+/* One rds_connection per RDS address pair */
struct rds_connection {
struct hlist_node c_hash_node;
__be32 c_laddr;
__be32 c_faddr;
unsigned int c_loopback:1,
- c_outgoing:1,
+ c_ping_triggered:1,
c_pad_to_32:30;
+ int c_npaths;
struct rds_connection *c_passive;
+ struct rds_transport *c_trans;
struct rds_cong_map *c_lcong;
struct rds_cong_map *c_fcong;
- struct rds_message *c_xmit_rm;
- unsigned long c_xmit_sg;
- unsigned int c_xmit_hdr_off;
- unsigned int c_xmit_data_off;
- unsigned int c_xmit_atomic_sent;
- unsigned int c_xmit_rdma_sent;
- unsigned int c_xmit_data_sent;
-
- spinlock_t c_lock; /* protect msg queues */
- u64 c_next_tx_seq;
- struct list_head c_send_queue;
- struct list_head c_retrans;
-
- u64 c_next_rx_seq;
-
- struct rds_transport *c_trans;
- void *c_transport_data;
-
- atomic_t c_state;
- unsigned long c_send_gen;
- unsigned long c_flags;
- unsigned long c_reconnect_jiffies;
- struct delayed_work c_send_w;
- struct delayed_work c_recv_w;
- struct delayed_work c_conn_w;
- struct work_struct c_down_w;
- struct mutex c_cm_lock; /* protect conn state & cm */
- wait_queue_head_t c_waitq;
+ /* Protocol version */
+ unsigned int c_version;
+ possible_net_t c_net;
struct list_head c_map_item;
unsigned long c_map_queued;
- unsigned int c_unacked_packets;
- unsigned int c_unacked_bytes;
-
- /* Protocol version */
- unsigned int c_version;
- possible_net_t c_net;
+ struct rds_conn_path c_path[RDS_MPATH_WORKERS];
+ wait_queue_head_t c_hs_waitq; /* handshake waitq */
};
static inline
@@ -153,6 +170,17 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net)
#define RDS_FLAG_RETRANSMITTED 0x04
#define RDS_MAX_ADV_CREDIT 255
+/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
+ * probe to exchange control information before establishing a connection.
+ * Currently the control information that is exchanged is the number of
+ * supported paths. If the peer is a legacy (older kernel revision) peer,
+ * it would return a pong message without additional control information
+ * that would then alert the sender that the peer was an older rev.
+ */
+#define RDS_FLAG_PROBE_PORT 1
+#define RDS_HS_PROBE(sport, dport) \
+ ((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \
+ (sport == 0 && dport == RDS_FLAG_PROBE_PORT))
/*
* Maximum space available for extension headers.
*/
@@ -212,12 +240,18 @@ struct rds_ext_header_rdma_dest {
__be32 h_rdma_offset;
};
+/* Extension header announcing number of paths.
+ * Implicit length = 2 bytes.
+ */
+#define RDS_EXTHDR_NPATHS 4
+
#define __RDS_EXTHDR_MAX 16 /* for now */
struct rds_incoming {
atomic_t i_refcount;
struct list_head i_item;
struct rds_connection *i_conn;
+ struct rds_conn_path *i_conn_path;
struct rds_header i_hdr;
unsigned long i_rx_jiffies;
__be32 i_saddr;
@@ -433,21 +467,22 @@ struct rds_transport {
char t_name[TRANSNAMSIZ];
struct list_head t_item;
struct module *t_owner;
- unsigned int t_prefer_loopback:1;
+ unsigned int t_prefer_loopback:1,
+ t_mp_capable:1;
unsigned int t_type;
int (*laddr_check)(struct net *net, __be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
- int (*conn_connect)(struct rds_connection *conn);
- void (*conn_shutdown)(struct rds_connection *conn);
- void (*xmit_prepare)(struct rds_connection *conn);
- void (*xmit_complete)(struct rds_connection *conn);
+ int (*conn_path_connect)(struct rds_conn_path *cp);
+ void (*conn_path_shutdown)(struct rds_conn_path *conn);
+ void (*xmit_path_prepare)(struct rds_conn_path *cp);
+ void (*xmit_path_complete)(struct rds_conn_path *cp);
int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
- int (*recv)(struct rds_connection *conn);
+ int (*recv_path)(struct rds_conn_path *cp);
int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to);
void (*inc_free)(struct rds_incoming *inc);
@@ -530,6 +565,7 @@ struct rds_sock {
/* Socket options - in case there will be more */
unsigned char rs_recverr,
rs_cong_monitor;
+ u32 rs_hash_initval;
};
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@@ -636,10 +672,12 @@ struct rds_connection *rds_conn_create(struct net *net,
struct rds_connection *rds_conn_create_outgoing(struct net *net,
__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
-void rds_conn_shutdown(struct rds_connection *conn);
+void rds_conn_shutdown(struct rds_conn_path *cpath);
void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_path_drop(struct rds_conn_path *cpath);
void rds_conn_connect_if_down(struct rds_connection *conn);
+void rds_conn_path_connect_if_down(struct rds_conn_path *cp);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
@@ -650,28 +688,60 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...);
#define rds_conn_error(conn, fmt...) \
__rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
+void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
+#define rds_conn_path_error(cp, fmt...) \
+ __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
+
+static inline int
+rds_conn_path_transition(struct rds_conn_path *cp, int old, int new)
+{
+ return atomic_cmpxchg(&cp->cp_state, old, new) == old;
+}
+
static inline int
rds_conn_transition(struct rds_connection *conn, int old, int new)
{
- return atomic_cmpxchg(&conn->c_state, old, new) == old;
+ WARN_ON(conn->c_trans->t_mp_capable);
+ return rds_conn_path_transition(&conn->c_path[0], old, new);
+}
+
+static inline int
+rds_conn_path_state(struct rds_conn_path *cp)
+{
+ return atomic_read(&cp->cp_state);
}
static inline int
rds_conn_state(struct rds_connection *conn)
{
- return atomic_read(&conn->c_state);
+ WARN_ON(conn->c_trans->t_mp_capable);
+ return rds_conn_path_state(&conn->c_path[0]);
+}
+
+static inline int
+rds_conn_path_up(struct rds_conn_path *cp)
+{
+ return atomic_read(&cp->cp_state) == RDS_CONN_UP;
}
static inline int
rds_conn_up(struct rds_connection *conn)
{
- return atomic_read(&conn->c_state) == RDS_CONN_UP;
+ WARN_ON(conn->c_trans->t_mp_capable);
+ return rds_conn_path_up(&conn->c_path[0]);
+}
+
+static inline int
+rds_conn_path_connecting(struct rds_conn_path *cp)
+{
+ return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING;
}
static inline int
rds_conn_connecting(struct rds_connection *conn)
{
- return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
+ WARN_ON(conn->c_trans->t_mp_capable);
+ return rds_conn_path_connecting(&conn->c_path[0]);
}
/* message.c */
@@ -720,6 +790,8 @@ void rds_page_exit(void);
/* recv.c */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr);
+void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
+ __be32 saddr);
void rds_inc_put(struct rds_incoming *inc);
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
struct rds_incoming *inc, gfp_t gfp);
@@ -733,16 +805,16 @@ void rds_inc_info_copy(struct rds_incoming *inc,
/* send.c */
int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
-void rds_send_reset(struct rds_connection *conn);
-int rds_send_xmit(struct rds_connection *conn);
+void rds_send_path_reset(struct rds_conn_path *conn);
+int rds_send_xmit(struct rds_conn_path *cp);
struct sockaddr_in;
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked);
-int rds_send_pong(struct rds_connection *conn, __be16 dport);
-struct rds_message *rds_send_get_message(struct rds_connection *,
- struct rm_rdma_op *);
+void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
+ is_acked_func is_acked);
+int rds_send_pong(struct rds_conn_path *cp, __be16 dport);
/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
@@ -809,12 +881,12 @@ extern unsigned int rds_sysctl_trace_level;
int rds_threads_init(void);
void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
-void rds_queue_reconnect(struct rds_connection *conn);
+void rds_queue_reconnect(struct rds_conn_path *cp);
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
-void rds_connect_path_complete(struct rds_connection *conn, int curr);
+void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
diff --git a/net/rds/rds_single_path.h b/net/rds/rds_single_path.h
new file mode 100644
index 000000000..e1241af7c
--- /dev/null
+++ b/net/rds/rds_single_path.h
@@ -0,0 +1,30 @@
+#ifndef _RDS_RDS_SINGLE_H
+#define _RDS_RDS_SINGLE_H
+
+#define c_xmit_rm c_path[0].cp_xmit_rm
+#define c_xmit_sg c_path[0].cp_xmit_sg
+#define c_xmit_hdr_off c_path[0].cp_xmit_hdr_off
+#define c_xmit_data_off c_path[0].cp_xmit_data_off
+#define c_xmit_atomic_sent c_path[0].cp_xmit_atomic_sent
+#define c_xmit_rdma_sent c_path[0].cp_xmit_rdma_sent
+#define c_xmit_data_sent c_path[0].cp_xmit_data_sent
+#define c_lock c_path[0].cp_lock
+#define c_next_tx_seq c_path[0].cp_next_tx_seq
+#define c_send_queue c_path[0].cp_send_queue
+#define c_retrans c_path[0].cp_retrans
+#define c_next_rx_seq c_path[0].cp_next_rx_seq
+#define c_transport_data c_path[0].cp_transport_data
+#define c_state c_path[0].cp_state
+#define c_send_gen c_path[0].cp_send_gen
+#define c_flags c_path[0].cp_flags
+#define c_reconnect_jiffies c_path[0].cp_reconnect_jiffies
+#define c_send_w c_path[0].cp_send_w
+#define c_recv_w c_path[0].cp_recv_w
+#define c_conn_w c_path[0].cp_conn_w
+#define c_down_w c_path[0].cp_down_w
+#define c_cm_lock c_path[0].cp_cm_lock
+#define c_waitq c_path[0].cp_waitq
+#define c_unacked_packets c_path[0].cp_unacked_packets
+#define c_unacked_bytes c_path[0].cp_unacked_bytes
+
+#endif /* _RDS_RDS_SINGLE_H */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 8413f6c99..cbfabdf3f 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -53,6 +53,20 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
}
EXPORT_SYMBOL_GPL(rds_inc_init);
+void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
+ __be32 saddr)
+{
+ atomic_set(&inc->i_refcount, 1);
+ INIT_LIST_HEAD(&inc->i_item);
+ inc->i_conn = cp->cp_conn;
+ inc->i_conn_path = cp;
+ inc->i_saddr = saddr;
+ inc->i_rdma_cookie = 0;
+ inc->i_rx_tstamp.tv_sec = 0;
+ inc->i_rx_tstamp.tv_usec = 0;
+}
+EXPORT_SYMBOL_GPL(rds_inc_path_init);
+
static void rds_inc_addref(struct rds_incoming *inc)
{
rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
@@ -142,6 +156,67 @@ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock
}
}
+static void rds_recv_hs_exthdrs(struct rds_header *hdr,
+ struct rds_connection *conn)
+{
+ unsigned int pos = 0, type, len;
+ union {
+ struct rds_ext_header_version version;
+ u16 rds_npaths;
+ } buffer;
+
+ while (1) {
+ len = sizeof(buffer);
+ type = rds_message_next_extension(hdr, &pos, &buffer, &len);
+ if (type == RDS_EXTHDR_NONE)
+ break;
+ /* Process extension header here */
+ switch (type) {
+ case RDS_EXTHDR_NPATHS:
+ conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
+ buffer.rds_npaths);
+ break;
+ default:
+ pr_warn_ratelimited("ignoring unknown exthdr type "
+ "0x%x\n", type);
+ }
+ }
+ /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
+ conn->c_npaths = max_t(int, conn->c_npaths, 1);
+}
+
+/* rds_start_mprds() will synchronously start multiple paths when appropriate.
+ * The scheme is based on the following rules:
+ *
+ * 1. rds_sendmsg on first connect attempt sends the probe ping, with the
+ * sender's npaths (s_npaths)
+ * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It
+ * sends back a probe-pong with r_npaths. After that, if rcvr is the
+ * smaller ip addr, it starts rds_conn_path_connect_if_down on all
+ * mprds_paths.
+ * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down.
+ * If it is the smaller ipaddr, rds_conn_path_connect_if_down can be
+ * called after reception of the probe-pong on all mprds_paths.
+ * Otherwise (sender of probe-ping is not the smaller ip addr): just call
+ * rds_conn_path_connect_if_down on the hashed path. (see rule 4)
+ * 4. when cp_index > 0, rds_connect_worker must only trigger
+ * a connection if laddr < faddr.
+ * 5. sender may end up queuing the packet on the cp. will get sent out later.
+ * when connection is completed.
+ */
+static void rds_start_mprds(struct rds_connection *conn)
+{
+ int i;
+ struct rds_conn_path *cp;
+
+ if (conn->c_npaths > 1 && conn->c_laddr < conn->c_faddr) {
+ for (i = 1; i < conn->c_npaths; i++) {
+ cp = &conn->c_path[i];
+ rds_conn_path_connect_if_down(cp);
+ }
+ }
+}
+
/*
* The transport must make sure that this is serialized against other
* rx and conn reset on this specific conn.
@@ -164,13 +239,18 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
struct rds_sock *rs = NULL;
struct sock *sk;
unsigned long flags;
+ struct rds_conn_path *cp;
inc->i_conn = conn;
inc->i_rx_jiffies = jiffies;
+ if (conn->c_trans->t_mp_capable)
+ cp = inc->i_conn_path;
+ else
+ cp = &conn->c_path[0];
rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
"flags 0x%x rx_jiffies %lu\n", conn,
- (unsigned long long)conn->c_next_rx_seq,
+ (unsigned long long)cp->cp_next_rx_seq,
inc,
(unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
be32_to_cpu(inc->i_hdr.h_len),
@@ -199,16 +279,34 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
* XXX we could spend more on the wire to get more robust failure
* detection, arguably worth it to avoid data corruption.
*/
- if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
+ if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
(inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
rds_stats_inc(s_recv_drop_old_seq);
goto out;
}
- conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+ cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+ if (inc->i_hdr.h_sport == 0) {
+ rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr);
+ goto out;
+ }
rds_stats_inc(s_recv_ping);
- rds_send_pong(conn, inc->i_hdr.h_sport);
+ rds_send_pong(cp, inc->i_hdr.h_sport);
+ /* if this is a handshake ping, start multipath if necessary */
+ if (RDS_HS_PROBE(inc->i_hdr.h_sport, inc->i_hdr.h_dport)) {
+ rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
+ rds_start_mprds(cp->cp_conn);
+ }
+ goto out;
+ }
+
+ if (inc->i_hdr.h_dport == RDS_FLAG_PROBE_PORT &&
+ inc->i_hdr.h_sport == 0) {
+ rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
+ /* if this is a handshake pong, start multipath if necessary */
+ rds_start_mprds(cp->cp_conn);
+ wake_up(&cp->cp_conn->c_hs_waitq);
goto out;
}
diff --git a/net/rds/send.c b/net/rds/send.c
index b1962f8e3..896626b9a 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -62,14 +62,14 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status);
* Reset the send state. Callers must ensure that this doesn't race with
* rds_send_xmit().
*/
-void rds_send_reset(struct rds_connection *conn)
+void rds_send_path_reset(struct rds_conn_path *cp)
{
struct rds_message *rm, *tmp;
unsigned long flags;
- if (conn->c_xmit_rm) {
- rm = conn->c_xmit_rm;
- conn->c_xmit_rm = NULL;
+ if (cp->cp_xmit_rm) {
+ rm = cp->cp_xmit_rm;
+ cp->cp_xmit_rm = NULL;
/* Tell the user the RDMA op is no longer mapped by the
* transport. This isn't entirely true (it's flushed out
* independently) but as the connection is down, there's
@@ -78,37 +78,37 @@ void rds_send_reset(struct rds_connection *conn)
rds_message_put(rm);
}
- conn->c_xmit_sg = 0;
- conn->c_xmit_hdr_off = 0;
- conn->c_xmit_data_off = 0;
- conn->c_xmit_atomic_sent = 0;
- conn->c_xmit_rdma_sent = 0;
- conn->c_xmit_data_sent = 0;
+ cp->cp_xmit_sg = 0;
+ cp->cp_xmit_hdr_off = 0;
+ cp->cp_xmit_data_off = 0;
+ cp->cp_xmit_atomic_sent = 0;
+ cp->cp_xmit_rdma_sent = 0;
+ cp->cp_xmit_data_sent = 0;
- conn->c_map_queued = 0;
+ cp->cp_conn->c_map_queued = 0;
- conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
- conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+ cp->cp_unacked_packets = rds_sysctl_max_unacked_packets;
+ cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes;
/* Mark messages as retransmissions, and move them to the send q */
- spin_lock_irqsave(&conn->c_lock, flags);
- list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
}
- list_splice_init(&conn->c_retrans, &conn->c_send_queue);
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ list_splice_init(&cp->cp_retrans, &cp->cp_send_queue);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
}
-EXPORT_SYMBOL_GPL(rds_send_reset);
+EXPORT_SYMBOL_GPL(rds_send_path_reset);
-static int acquire_in_xmit(struct rds_connection *conn)
+static int acquire_in_xmit(struct rds_conn_path *cp)
{
- return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
+ return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0;
}
-static void release_in_xmit(struct rds_connection *conn)
+static void release_in_xmit(struct rds_conn_path *cp)
{
- clear_bit(RDS_IN_XMIT, &conn->c_flags);
+ clear_bit(RDS_IN_XMIT, &cp->cp_flags);
smp_mb__after_atomic();
/*
* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
@@ -116,8 +116,8 @@ static void release_in_xmit(struct rds_connection *conn)
* the system-wide hashed waitqueue buckets in the fast path only to
* almost never find waiters.
*/
- if (waitqueue_active(&conn->c_waitq))
- wake_up_all(&conn->c_waitq);
+ if (waitqueue_active(&cp->cp_waitq))
+ wake_up_all(&cp->cp_waitq);
}
/*
@@ -134,8 +134,9 @@ static void release_in_xmit(struct rds_connection *conn)
* - small message latency is higher behind queued large messages
* - large message latency isn't starved by intervening small sends
*/
-int rds_send_xmit(struct rds_connection *conn)
+int rds_send_xmit(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_message *rm;
unsigned long flags;
unsigned int tmp;
@@ -155,7 +156,7 @@ restart:
* avoids blocking the caller and trading per-connection data between
* caches per message.
*/
- if (!acquire_in_xmit(conn)) {
+ if (!acquire_in_xmit(cp)) {
rds_stats_inc(s_send_lock_contention);
ret = -ENOMEM;
goto out;
@@ -169,21 +170,21 @@ restart:
* The acquire_in_xmit() check above ensures that only one
* caller can increment c_send_gen at any time.
*/
- conn->c_send_gen++;
- send_gen = conn->c_send_gen;
+ cp->cp_send_gen++;
+ send_gen = cp->cp_send_gen;
/*
* rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
* we do the opposite to avoid races.
*/
- if (!rds_conn_up(conn)) {
- release_in_xmit(conn);
+ if (!rds_conn_path_up(cp)) {
+ release_in_xmit(cp);
ret = 0;
goto out;
}
- if (conn->c_trans->xmit_prepare)
- conn->c_trans->xmit_prepare(conn);
+ if (conn->c_trans->xmit_path_prepare)
+ conn->c_trans->xmit_path_prepare(cp);
/*
* spin trying to push headers and data down the connection until
@@ -191,7 +192,7 @@ restart:
*/
while (1) {
- rm = conn->c_xmit_rm;
+ rm = cp->cp_xmit_rm;
/*
* If between sending messages, we can send a pending congestion
@@ -204,14 +205,16 @@ restart:
break;
}
rm->data.op_active = 1;
+ rm->m_inc.i_conn_path = cp;
+ rm->m_inc.i_conn = cp->cp_conn;
- conn->c_xmit_rm = rm;
+ cp->cp_xmit_rm = rm;
}
/*
* If not already working on one, grab the next message.
*
- * c_xmit_rm holds a ref while we're sending this message down
+ * cp_xmit_rm holds a ref while we're sending this message down
* the connction. We can use this ref while holding the
* send_sem.. rds_send_reset() is serialized with it.
*/
@@ -228,10 +231,10 @@ restart:
if (batch_count >= send_batch_count)
goto over_batch;
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock_irqsave(&cp->cp_lock, flags);
- if (!list_empty(&conn->c_send_queue)) {
- rm = list_entry(conn->c_send_queue.next,
+ if (!list_empty(&cp->cp_send_queue)) {
+ rm = list_entry(cp->cp_send_queue.next,
struct rds_message,
m_conn_item);
rds_message_addref(rm);
@@ -240,10 +243,11 @@ restart:
* Move the message from the send queue to the retransmit
* list right away.
*/
- list_move_tail(&rm->m_conn_item, &conn->c_retrans);
+ list_move_tail(&rm->m_conn_item,
+ &cp->cp_retrans);
}
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
if (!rm)
break;
@@ -257,32 +261,34 @@ restart:
*/
if (rm->rdma.op_active &&
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock_irqsave(&cp->cp_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped);
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
continue;
}
/* Require an ACK every once in a while */
len = ntohl(rm->m_inc.i_hdr.h_len);
- if (conn->c_unacked_packets == 0 ||
- conn->c_unacked_bytes < len) {
+ if (cp->cp_unacked_packets == 0 ||
+ cp->cp_unacked_bytes < len) {
__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
- conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
- conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+ cp->cp_unacked_packets =
+ rds_sysctl_max_unacked_packets;
+ cp->cp_unacked_bytes =
+ rds_sysctl_max_unacked_bytes;
rds_stats_inc(s_send_ack_required);
} else {
- conn->c_unacked_bytes -= len;
- conn->c_unacked_packets--;
+ cp->cp_unacked_bytes -= len;
+ cp->cp_unacked_packets--;
}
- conn->c_xmit_rm = rm;
+ cp->cp_xmit_rm = rm;
}
/* The transport either sends the whole rdma or none of it */
- if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+ if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) {
rm->m_final_op = &rm->rdma;
/* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue
@@ -294,11 +300,11 @@ restart:
wake_up_interruptible(&rm->m_flush_wait);
break;
}
- conn->c_xmit_rdma_sent = 1;
+ cp->cp_xmit_rdma_sent = 1;
}
- if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+ if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) {
rm->m_final_op = &rm->atomic;
/* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue
@@ -310,7 +316,7 @@ restart:
wake_up_interruptible(&rm->m_flush_wait);
break;
}
- conn->c_xmit_atomic_sent = 1;
+ cp->cp_xmit_atomic_sent = 1;
}
@@ -336,41 +342,42 @@ restart:
rm->data.op_active = 0;
}
- if (rm->data.op_active && !conn->c_xmit_data_sent) {
+ if (rm->data.op_active && !cp->cp_xmit_data_sent) {
rm->m_final_op = &rm->data;
+
ret = conn->c_trans->xmit(conn, rm,
- conn->c_xmit_hdr_off,
- conn->c_xmit_sg,
- conn->c_xmit_data_off);
+ cp->cp_xmit_hdr_off,
+ cp->cp_xmit_sg,
+ cp->cp_xmit_data_off);
if (ret <= 0)
break;
- if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
+ if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) {
tmp = min_t(int, ret,
sizeof(struct rds_header) -
- conn->c_xmit_hdr_off);
- conn->c_xmit_hdr_off += tmp;
+ cp->cp_xmit_hdr_off);
+ cp->cp_xmit_hdr_off += tmp;
ret -= tmp;
}
- sg = &rm->data.op_sg[conn->c_xmit_sg];
+ sg = &rm->data.op_sg[cp->cp_xmit_sg];
while (ret) {
tmp = min_t(int, ret, sg->length -
- conn->c_xmit_data_off);
- conn->c_xmit_data_off += tmp;
+ cp->cp_xmit_data_off);
+ cp->cp_xmit_data_off += tmp;
ret -= tmp;
- if (conn->c_xmit_data_off == sg->length) {
- conn->c_xmit_data_off = 0;
+ if (cp->cp_xmit_data_off == sg->length) {
+ cp->cp_xmit_data_off = 0;
sg++;
- conn->c_xmit_sg++;
- BUG_ON(ret != 0 &&
- conn->c_xmit_sg == rm->data.op_nents);
+ cp->cp_xmit_sg++;
+ BUG_ON(ret != 0 && cp->cp_xmit_sg ==
+ rm->data.op_nents);
}
}
- if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
- (conn->c_xmit_sg == rm->data.op_nents))
- conn->c_xmit_data_sent = 1;
+ if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) &&
+ (cp->cp_xmit_sg == rm->data.op_nents))
+ cp->cp_xmit_data_sent = 1;
}
/*
@@ -378,23 +385,23 @@ restart:
* if there is a data op. Thus, if the data is sent (or there was
* none), then we're done with the rm.
*/
- if (!rm->data.op_active || conn->c_xmit_data_sent) {
- conn->c_xmit_rm = NULL;
- conn->c_xmit_sg = 0;
- conn->c_xmit_hdr_off = 0;
- conn->c_xmit_data_off = 0;
- conn->c_xmit_rdma_sent = 0;
- conn->c_xmit_atomic_sent = 0;
- conn->c_xmit_data_sent = 0;
+ if (!rm->data.op_active || cp->cp_xmit_data_sent) {
+ cp->cp_xmit_rm = NULL;
+ cp->cp_xmit_sg = 0;
+ cp->cp_xmit_hdr_off = 0;
+ cp->cp_xmit_data_off = 0;
+ cp->cp_xmit_rdma_sent = 0;
+ cp->cp_xmit_atomic_sent = 0;
+ cp->cp_xmit_data_sent = 0;
rds_message_put(rm);
}
}
over_batch:
- if (conn->c_trans->xmit_complete)
- conn->c_trans->xmit_complete(conn);
- release_in_xmit(conn);
+ if (conn->c_trans->xmit_path_complete)
+ conn->c_trans->xmit_path_complete(cp);
+ release_in_xmit(cp);
/* Nuke any messages we decided not to retransmit. */
if (!list_empty(&to_be_dropped)) {
@@ -422,12 +429,12 @@ over_batch:
if (ret == 0) {
smp_mb();
if ((test_bit(0, &conn->c_map_queued) ||
- !list_empty(&conn->c_send_queue)) &&
- send_gen == conn->c_send_gen) {
+ !list_empty(&cp->cp_send_queue)) &&
+ send_gen == cp->cp_send_gen) {
rds_stats_inc(s_send_lock_queue_raced);
if (batch_count < send_batch_count)
goto restart;
- queue_delayed_work(rds_wq, &conn->c_send_w, 1);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
}
}
out:
@@ -560,42 +567,6 @@ __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
}
/*
- * This is called from the IB send completion when we detect
- * a RDMA operation that failed with remote access error.
- * So speed is not an issue here.
- */
-struct rds_message *rds_send_get_message(struct rds_connection *conn,
- struct rm_rdma_op *op)
-{
- struct rds_message *rm, *tmp, *found = NULL;
- unsigned long flags;
-
- spin_lock_irqsave(&conn->c_lock, flags);
-
- list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
- if (&rm->rdma == op) {
- atomic_inc(&rm->m_refcount);
- found = rm;
- goto out;
- }
- }
-
- list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
- if (&rm->rdma == op) {
- atomic_inc(&rm->m_refcount);
- found = rm;
- break;
- }
- }
-
-out:
- spin_unlock_irqrestore(&conn->c_lock, flags);
-
- return found;
-}
-EXPORT_SYMBOL_GPL(rds_send_get_message);
-
-/*
* This removes messages from the socket's list if they're on it. The list
* argument must be private to the caller, we must be able to modify it
* without locks. The messages must have a reference held for their
@@ -685,16 +656,16 @@ unlock_and_drop:
* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
* checks the RDS_MSG_HAS_ACK_SEQ bit.
*/
-void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
- is_acked_func is_acked)
+void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
+ is_acked_func is_acked)
{
struct rds_message *rm, *tmp;
unsigned long flags;
LIST_HEAD(list);
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock_irqsave(&cp->cp_lock, flags);
- list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
if (!rds_send_is_acked(rm, ack, is_acked))
break;
@@ -706,17 +677,26 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
if (!list_empty(&list))
smp_mb__after_atomic();
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
/* now remove the messages from the sock list as needed */
rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
}
+EXPORT_SYMBOL_GPL(rds_send_path_drop_acked);
+
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+ is_acked_func is_acked)
+{
+ WARN_ON(conn->c_trans->t_mp_capable);
+ rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked);
+}
EXPORT_SYMBOL_GPL(rds_send_drop_acked);
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
struct rds_message *rm, *tmp;
struct rds_connection *conn;
+ struct rds_conn_path *cp;
unsigned long flags;
LIST_HEAD(list);
@@ -745,22 +725,26 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
list_for_each_entry(rm, &list, m_sock_item) {
conn = rm->m_inc.i_conn;
+ if (conn->c_trans->t_mp_capable)
+ cp = rm->m_inc.i_conn_path;
+ else
+ cp = &conn->c_path[0];
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock_irqsave(&cp->cp_lock, flags);
/*
* Maybe someone else beat us to removing rm from the conn.
* If we race with their flag update we'll get the lock and
* then really see that the flag has been cleared.
*/
if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
spin_lock_irqsave(&rm->m_rs_lock, flags);
rm->m_rs = NULL;
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
continue;
}
list_del_init(&rm->m_conn_item);
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
/*
* Couldn't grab m_rs_lock in top loop (lock ordering),
@@ -809,6 +793,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
* message from the flow with RDS_CANCEL_SENT_TO.
*/
static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
+ struct rds_conn_path *cp,
struct rds_message *rm, __be16 sport,
__be16 dport, int *queued)
{
@@ -852,13 +837,14 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
trying to minimize the time we hold c_lock */
rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
rm->m_inc.i_conn = conn;
+ rm->m_inc.i_conn_path = cp;
rds_message_addref(rm);
- spin_lock(&conn->c_lock);
- rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
- list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ spin_lock(&cp->cp_lock);
+ rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++);
+ list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
- spin_unlock(&conn->c_lock);
+ spin_unlock(&cp->cp_lock);
rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
rm, len, rs, rs->rs_snd_bytes,
@@ -977,6 +963,29 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
return ret;
}
+static void rds_send_ping(struct rds_connection *conn);
+
+static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
+{
+ int hash;
+
+ if (conn->c_npaths == 0)
+ hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
+ else
+ hash = RDS_MPATH_HASH(rs, conn->c_npaths);
+ if (conn->c_npaths == 0 && hash != 0) {
+ rds_send_ping(conn);
+
+ if (conn->c_npaths == 0) {
+ wait_event_interruptible(conn->c_hs_waitq,
+ (conn->c_npaths != 0));
+ }
+ if (conn->c_npaths == 1)
+ hash = 0;
+ }
+ return hash;
+}
+
int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
{
struct sock *sk = sock->sk;
@@ -990,6 +999,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
int queued = 0, allocated_mr = 0;
int nonblock = msg->msg_flags & MSG_DONTWAIT;
long timeo = sock_sndtimeo(sk, nonblock);
+ struct rds_conn_path *cpath;
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
@@ -1088,15 +1098,19 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
goto out;
}
- rds_conn_connect_if_down(conn);
+ if (conn->c_trans->t_mp_capable)
+ cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)];
+ else
+ cpath = &conn->c_path[0];
+
+ rds_conn_path_connect_if_down(cpath);
ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
if (ret) {
rs->rs_seen_congestion = 1;
goto out;
}
-
- while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
+ while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port,
dport, &queued)) {
rds_stats_inc(s_send_queue_full);
@@ -1106,7 +1120,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
}
timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
- rds_send_queue_rm(rs, conn, rm,
+ rds_send_queue_rm(rs, conn, cpath, rm,
rs->rs_bound_port,
dport,
&queued),
@@ -1127,9 +1141,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
*/
rds_stats_inc(s_send_queued);
- ret = rds_send_xmit(conn);
+ ret = rds_send_xmit(cpath);
if (ret == -ENOMEM || ret == -EAGAIN)
- queue_delayed_work(rds_wq, &conn->c_send_w, 1);
+ queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
rds_message_put(rm);
return payload_len;
@@ -1147,10 +1161,16 @@ out:
}
/*
- * Reply to a ping packet.
+ * send out a probe. Can be shared by rds_send_ping,
+ * rds_send_pong, rds_send_hb.
+ * rds_send_hb should use h_flags
+ * RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED
+ * or
+ * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
*/
int
-rds_send_pong(struct rds_connection *conn, __be16 dport)
+rds_send_probe(struct rds_conn_path *cp, __be16 sport,
+ __be16 dport, u8 h_flags)
{
struct rds_message *rm;
unsigned long flags;
@@ -1162,31 +1182,41 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
goto out;
}
- rm->m_daddr = conn->c_faddr;
+ rm->m_daddr = cp->cp_conn->c_faddr;
rm->data.op_active = 1;
- rds_conn_connect_if_down(conn);
+ rds_conn_path_connect_if_down(cp);
- ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
+ ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL);
if (ret)
goto out;
- spin_lock_irqsave(&conn->c_lock, flags);
- list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
rds_message_addref(rm);
- rm->m_inc.i_conn = conn;
+ rm->m_inc.i_conn = cp->cp_conn;
+ rm->m_inc.i_conn_path = cp;
+
+ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport,
+ cp->cp_next_tx_seq);
+ rm->m_inc.i_hdr.h_flags |= h_flags;
+ cp->cp_next_tx_seq++;
+
+ if (RDS_HS_PROBE(sport, dport) && cp->cp_conn->c_trans->t_mp_capable) {
+ u16 npaths = RDS_MPATH_WORKERS;
- rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
- conn->c_next_tx_seq);
- conn->c_next_tx_seq++;
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_NPATHS, &npaths,
+ sizeof(npaths));
+ }
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
rds_stats_inc(s_send_queued);
rds_stats_inc(s_send_pong);
/* schedule the send work on rds_wq */
- queue_delayed_work(rds_wq, &conn->c_send_w, 1);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
rds_message_put(rm);
return 0;
@@ -1196,3 +1226,25 @@ out:
rds_message_put(rm);
return ret;
}
+
+int
+rds_send_pong(struct rds_conn_path *cp, __be16 dport)
+{
+ return rds_send_probe(cp, 0, dport, 0);
+}
+
+void
+rds_send_ping(struct rds_connection *conn)
+{
+ unsigned long flags;
+ struct rds_conn_path *cp = &conn->c_path[0];
+
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ if (conn->c_ping_triggered) {
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+ return;
+ }
+ conn->c_ping_triggered = 1;
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+ rds_send_probe(&conn->c_path[0], RDS_FLAG_PROBE_PORT, 0, 0);
+}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index c8a7b4c90..fcddacc92 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -56,8 +56,8 @@ static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *fpos);
-int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
-int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
+static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
+static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
static struct ctl_table rds_tcp_sysctl_table[] = {
#define RDS_TCP_SNDBUF 0
@@ -135,9 +135,9 @@ void rds_tcp_restore_callbacks(struct socket *sock,
* from being called while it isn't set.
*/
void rds_tcp_reset_callbacks(struct socket *sock,
- struct rds_connection *conn)
+ struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct socket *osock = tc->t_sock;
if (!osock)
@@ -147,8 +147,8 @@ void rds_tcp_reset_callbacks(struct socket *sock,
* We have an outstanding SYN to this peer, which may
* potentially have transitioned to the RDS_CONN_UP state,
* so we must quiesce any send threads before resetting
- * c_transport_data. We quiesce these threads by setting
- * c_state to something other than RDS_CONN_UP, and then
+ * cp_transport_data. We quiesce these threads by setting
+ * cp_state to something other than RDS_CONN_UP, and then
* waiting for any existing threads in rds_send_xmit to
* complete release_in_xmit(). (Subsequent threads entering
* rds_send_xmit() will bail on !rds_conn_up().
@@ -163,38 +163,25 @@ void rds_tcp_reset_callbacks(struct socket *sock,
* RDS_CONN_RESETTTING, to ensure that rds_tcp_state_change
* cannot mark rds_conn_path_up() in the window before lock_sock()
*/
- atomic_set(&conn->c_state, RDS_CONN_RESETTING);
- wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags));
+ atomic_set(&cp->cp_state, RDS_CONN_RESETTING);
+ wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags));
lock_sock(osock->sk);
/* reset receive side state for rds_tcp_data_recv() for osock */
+ cancel_delayed_work_sync(&cp->cp_send_w);
+ cancel_delayed_work_sync(&cp->cp_recv_w);
if (tc->t_tinc) {
rds_inc_put(&tc->t_tinc->ti_inc);
tc->t_tinc = NULL;
}
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
tc->t_tinc_data_rem = 0;
- tc->t_sock = NULL;
-
- write_lock_bh(&osock->sk->sk_callback_lock);
-
- osock->sk->sk_user_data = NULL;
- osock->sk->sk_data_ready = tc->t_orig_data_ready;
- osock->sk->sk_write_space = tc->t_orig_write_space;
- osock->sk->sk_state_change = tc->t_orig_state_change;
- write_unlock_bh(&osock->sk->sk_callback_lock);
+ rds_tcp_restore_callbacks(osock, tc);
release_sock(osock->sk);
sock_release(osock);
newsock:
- rds_send_reset(conn);
+ rds_send_path_reset(cp);
lock_sock(sock->sk);
- write_lock_bh(&sock->sk->sk_callback_lock);
- tc->t_sock = sock;
- sock->sk->sk_user_data = conn;
- sock->sk->sk_data_ready = rds_tcp_data_ready;
- sock->sk->sk_write_space = rds_tcp_write_space;
- sock->sk->sk_state_change = rds_tcp_state_change;
-
- write_unlock_bh(&sock->sk->sk_callback_lock);
+ rds_tcp_set_callbacks(sock, cp);
release_sock(sock->sk);
}
@@ -202,9 +189,9 @@ newsock:
* above rds_tcp_reset_callbacks for notes about synchronization
* with data path
*/
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
write_lock_bh(&sock->sk->sk_callback_lock);
@@ -220,12 +207,12 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
sock->sk->sk_data_ready = sock->sk->sk_user_data;
tc->t_sock = sock;
- tc->conn = conn;
+ tc->t_cpath = cp;
tc->t_orig_data_ready = sock->sk->sk_data_ready;
tc->t_orig_write_space = sock->sk->sk_write_space;
tc->t_orig_state_change = sock->sk->sk_state_change;
- sock->sk->sk_user_data = conn;
+ sock->sk->sk_user_data = cp;
sock->sk->sk_data_ready = rds_tcp_data_ready;
sock->sk->sk_write_space = rds_tcp_write_space;
sock->sk->sk_state_change = rds_tcp_state_change;
@@ -283,24 +270,29 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr)
static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
struct rds_tcp_connection *tc;
+ int i;
- tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
- if (!tc)
- return -ENOMEM;
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
+ if (!tc)
+ return -ENOMEM;
- mutex_init(&tc->t_conn_lock);
- tc->t_sock = NULL;
- tc->t_tinc = NULL;
- tc->t_tinc_hdr_rem = sizeof(struct rds_header);
- tc->t_tinc_data_rem = 0;
+ mutex_init(&tc->t_conn_path_lock);
+ tc->t_sock = NULL;
+ tc->t_tinc = NULL;
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
- conn->c_transport_data = tc;
+ conn->c_path[i].cp_transport_data = tc;
+ tc->t_cpath = &conn->c_path[i];
- spin_lock_irq(&rds_tcp_conn_lock);
- list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
- spin_unlock_irq(&rds_tcp_conn_lock);
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
+ spin_unlock_irq(&rds_tcp_conn_lock);
+ rdsdebug("rds_conn_path [%d] tc %p\n", i,
+ conn->c_path[i].cp_transport_data);
+ }
- rdsdebug("alloced tc %p\n", conn->c_transport_data);
return 0;
}
@@ -317,6 +309,17 @@ static void rds_tcp_conn_free(void *arg)
kmem_cache_free(rds_tcp_conn_slab, tc);
}
+static bool list_has_conn(struct list_head *list, struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc, *_tc;
+
+ list_for_each_entry_safe(tc, _tc, list, t_tcp_node) {
+ if (tc->t_cpath->cp_conn == conn)
+ return true;
+ }
+ return false;
+}
+
static void rds_tcp_destroy_conns(void)
{
struct rds_tcp_connection *tc, *_tc;
@@ -324,29 +327,28 @@ static void rds_tcp_destroy_conns(void)
/* avoid calling conn_destroy with irqs off */
spin_lock_irq(&rds_tcp_conn_lock);
- list_splice(&rds_tcp_conn_list, &tmp_list);
- INIT_LIST_HEAD(&rds_tcp_conn_list);
+ list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
+ if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn))
+ list_move_tail(&tc->t_tcp_node, &tmp_list);
+ }
spin_unlock_irq(&rds_tcp_conn_lock);
- list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
- if (tc->conn->c_passive)
- rds_conn_destroy(tc->conn->c_passive);
- rds_conn_destroy(tc->conn);
- }
+ list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
+ rds_conn_destroy(tc->t_cpath->cp_conn);
}
static void rds_tcp_exit(void);
struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check,
- .xmit_prepare = rds_tcp_xmit_prepare,
- .xmit_complete = rds_tcp_xmit_complete,
+ .xmit_path_prepare = rds_tcp_xmit_path_prepare,
+ .xmit_path_complete = rds_tcp_xmit_path_complete,
.xmit = rds_tcp_xmit,
- .recv = rds_tcp_recv,
+ .recv_path = rds_tcp_recv_path,
.conn_alloc = rds_tcp_conn_alloc,
.conn_free = rds_tcp_conn_free,
- .conn_connect = rds_tcp_conn_connect,
- .conn_shutdown = rds_tcp_conn_shutdown,
+ .conn_path_connect = rds_tcp_conn_path_connect,
+ .conn_path_shutdown = rds_tcp_conn_path_shutdown,
.inc_copy_to_user = rds_tcp_inc_copy_to_user,
.inc_free = rds_tcp_inc_free,
.stats_info_copy = rds_tcp_stats_info_copy,
@@ -355,6 +357,7 @@ struct rds_transport rds_tcp_transport = {
.t_name = "tcp",
.t_type = RDS_TRANS_TCP,
.t_prefer_loopback = 1,
+ .t_mp_capable = 1,
};
static int rds_tcp_netid;
@@ -488,10 +491,30 @@ static struct pernet_operations rds_tcp_net_ops = {
.size = sizeof(struct rds_tcp_net),
};
+/* explicitly send a RST on each socket, thereby releasing any socket refcnts
+ * that may otherwise hold up netns deletion.
+ */
+static void rds_tcp_conn_paths_destroy(struct rds_connection *conn)
+{
+ struct rds_conn_path *cp;
+ struct rds_tcp_connection *tc;
+ int i;
+ struct sock *sk;
+
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ cp = &conn->c_path[i];
+ tc = cp->cp_transport_data;
+ if (!tc->t_sock)
+ continue;
+ sk = tc->t_sock->sk;
+ sk->sk_prot->disconnect(sk, 0);
+ tcp_done(sk);
+ }
+}
+
static void rds_tcp_kill_sock(struct net *net)
{
struct rds_tcp_connection *tc, *_tc;
- struct sock *sk;
LIST_HEAD(tmp_list);
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
@@ -500,23 +523,27 @@ static void rds_tcp_kill_sock(struct net *net)
flush_work(&rtn->rds_tcp_accept_w);
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
- struct net *c_net = read_pnet(&tc->conn->c_net);
+ struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
if (net != c_net || !tc->t_sock)
continue;
- list_move_tail(&tc->t_tcp_node, &tmp_list);
+ if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn))
+ list_move_tail(&tc->t_tcp_node, &tmp_list);
}
spin_unlock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
- sk = tc->t_sock->sk;
- sk->sk_prot->disconnect(sk, 0);
- tcp_done(sk);
- if (tc->conn->c_passive)
- rds_conn_destroy(tc->conn->c_passive);
- rds_conn_destroy(tc->conn);
+ rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn);
+ rds_conn_destroy(tc->t_cpath->cp_conn);
}
}
+void *rds_tcp_listen_sock_def_readable(struct net *net)
+{
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ return rtn->rds_tcp_listen_sock->sk->sk_user_data;
+}
+
static int rds_tcp_dev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
@@ -551,12 +578,13 @@ static void rds_tcp_sysctl_reset(struct net *net)
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
- struct net *c_net = read_pnet(&tc->conn->c_net);
+ struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
if (net != c_net || !tc->t_sock)
continue;
- rds_conn_drop(tc->conn); /* reconnect with new parameters */
+ /* reconnect with new parameters */
+ rds_conn_path_drop(tc->t_cpath);
}
spin_unlock_irq(&rds_tcp_conn_lock);
}
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 7940babf6..9a1cc8906 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -11,11 +11,11 @@ struct rds_tcp_incoming {
struct rds_tcp_connection {
struct list_head t_tcp_node;
- struct rds_connection *conn;
- /* t_conn_lock synchronizes the connection establishment between
- * rds_tcp_accept_one and rds_tcp_conn_connect
+ struct rds_conn_path *t_cpath;
+ /* t_conn_path_lock synchronizes the connection establishment between
+ * rds_tcp_accept_one and rds_tcp_conn_path_connect
*/
- struct mutex t_conn_lock;
+ struct mutex t_conn_path_lock;
struct socket *t_sock;
void *t_orig_write_space;
void *t_orig_data_ready;
@@ -49,8 +49,8 @@ struct rds_tcp_statistics {
/* tcp.c */
void rds_tcp_tune(struct socket *sock);
void rds_tcp_nonagle(struct socket *sock);
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
-void rds_tcp_reset_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
+void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_restore_callbacks(struct socket *sock,
struct rds_tcp_connection *tc);
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
@@ -60,8 +60,8 @@ extern struct rds_transport rds_tcp_transport;
void rds_tcp_accept_work(struct sock *sk);
/* tcp_connect.c */
-int rds_tcp_conn_connect(struct rds_connection *conn);
-void rds_tcp_conn_shutdown(struct rds_connection *conn);
+int rds_tcp_conn_path_connect(struct rds_conn_path *cp);
+void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn);
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
@@ -70,18 +70,19 @@ void rds_tcp_listen_stop(struct socket *);
void rds_tcp_listen_data_ready(struct sock *sk);
int rds_tcp_accept_one(struct socket *sock);
int rds_tcp_keepalive(struct socket *sock);
+void *rds_tcp_listen_sock_def_readable(struct net *net);
/* tcp_recv.c */
int rds_tcp_recv_init(void);
void rds_tcp_recv_exit(void);
void rds_tcp_data_ready(struct sock *sk);
-int rds_tcp_recv(struct rds_connection *conn);
+int rds_tcp_recv_path(struct rds_conn_path *cp);
void rds_tcp_inc_free(struct rds_incoming *inc);
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
/* tcp_send.c */
-void rds_tcp_xmit_prepare(struct rds_connection *conn);
-void rds_tcp_xmit_complete(struct rds_connection *conn);
+void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp);
+void rds_tcp_xmit_path_complete(struct rds_conn_path *cp);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_tcp_write_space(struct sock *sk);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index f6e95d60d..05f61c533 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -40,16 +40,16 @@
void rds_tcp_state_change(struct sock *sk)
{
void (*state_change)(struct sock *sk);
- struct rds_connection *conn;
+ struct rds_conn_path *cp;
struct rds_tcp_connection *tc;
read_lock_bh(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) {
+ cp = sk->sk_user_data;
+ if (!cp) {
state_change = sk->sk_state_change;
goto out;
}
- tc = conn->c_transport_data;
+ tc = cp->cp_transport_data;
state_change = tc->t_orig_state_change;
rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
@@ -60,11 +60,11 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_SYN_RECV:
break;
case TCP_ESTABLISHED:
- rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+ rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSE:
- rds_conn_drop(conn);
+ rds_conn_path_drop(cp);
default:
break;
}
@@ -73,17 +73,24 @@ out:
state_change(sk);
}
-int rds_tcp_conn_connect(struct rds_connection *conn)
+int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
{
struct socket *sock = NULL;
struct sockaddr_in src, dest;
int ret;
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_connection *conn = cp->cp_conn;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
- mutex_lock(&tc->t_conn_lock);
+ /* for multipath rds,we only trigger the connection after
+ * the handshake probe has determined the number of paths.
+ */
+ if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2)
+ return -EAGAIN;
+
+ mutex_lock(&tc->t_conn_path_lock);
- if (rds_conn_up(conn)) {
- mutex_unlock(&tc->t_conn_lock);
+ if (rds_conn_path_up(cp)) {
+ mutex_unlock(&tc->t_conn_path_lock);
return 0;
}
ret = sock_create_kern(rds_conn_net(conn), PF_INET,
@@ -112,10 +119,11 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
* once we call connect() we can start getting callbacks and they
* own the socket
*/
- rds_tcp_set_callbacks(sock, conn);
+ rds_tcp_set_callbacks(sock, cp);
ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
O_NONBLOCK);
+ cp->cp_outgoing = 1;
rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
if (ret == -EINPROGRESS)
ret = 0;
@@ -123,11 +131,11 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
rds_tcp_keepalive(sock);
sock = NULL;
} else {
- rds_tcp_restore_callbacks(sock, conn->c_transport_data);
+ rds_tcp_restore_callbacks(sock, cp->cp_transport_data);
}
out:
- mutex_unlock(&tc->t_conn_lock);
+ mutex_unlock(&tc->t_conn_path_lock);
if (sock)
sock_release(sock);
return ret;
@@ -142,12 +150,13 @@ out:
* callbacks to those set by TCP. Our callbacks won't execute again once we
* hold the sock lock.
*/
-void rds_tcp_conn_shutdown(struct rds_connection *conn)
+void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct socket *sock = tc->t_sock;
- rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
+ rdsdebug("shutting down conn %p tc %p sock %p\n",
+ cp->cp_conn, tc, sock);
if (sock) {
sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 245542ca4..e0b23fb5b 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -70,6 +70,52 @@ bail:
return ret;
}
+/* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the
+ * client's ipaddr < server's ipaddr. Otherwise, close the accepted
+ * socket and force a reconneect from smaller -> larger ip addr. The reason
+ * we special case cp_index 0 is to allow the rds probe ping itself to itself
+ * get through efficiently.
+ * Since reconnects are only initiated from the node with the numerically
+ * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
+ * by moving them to CONNECTING in this function.
+ */
+struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
+{
+ int i;
+ bool peer_is_smaller = (conn->c_faddr < conn->c_laddr);
+ int npaths = conn->c_npaths;
+
+ if (npaths <= 1) {
+ struct rds_conn_path *cp = &conn->c_path[0];
+ int ret;
+
+ ret = rds_conn_path_transition(cp, RDS_CONN_DOWN,
+ RDS_CONN_CONNECTING);
+ if (!ret)
+ rds_conn_path_transition(cp, RDS_CONN_ERROR,
+ RDS_CONN_CONNECTING);
+ return cp->cp_transport_data;
+ }
+
+ /* for mprds, paths with cp_index > 0 MUST be initiated by the peer
+ * with the smaller address.
+ */
+ if (!peer_is_smaller)
+ return NULL;
+
+ for (i = 1; i < npaths; i++) {
+ struct rds_conn_path *cp = &conn->c_path[i];
+
+ if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
+ RDS_CONN_CONNECTING) ||
+ rds_conn_path_transition(cp, RDS_CONN_ERROR,
+ RDS_CONN_CONNECTING)) {
+ return cp->cp_transport_data;
+ }
+ }
+ return NULL;
+}
+
int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
@@ -78,6 +124,7 @@ int rds_tcp_accept_one(struct socket *sock)
struct inet_sock *inet;
struct rds_tcp_connection *rs_tcp = NULL;
int conn_state;
+ struct rds_conn_path *cp;
if (!sock) /* module unload or netns delete in progress */
return -ENETUNREACH;
@@ -118,11 +165,14 @@ int rds_tcp_accept_one(struct socket *sock)
* If the client reboots, this conn will need to be cleaned up.
* rds_tcp_state_change() will do that cleanup
*/
- rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
- rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
- mutex_lock(&rs_tcp->t_conn_lock);
- conn_state = rds_conn_state(conn);
- if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP)
+ rs_tcp = rds_tcp_accept_one_path(conn);
+ if (!rs_tcp)
+ goto rst_nsk;
+ mutex_lock(&rs_tcp->t_conn_path_lock);
+ cp = rs_tcp->t_cpath;
+ conn_state = rds_conn_path_state(cp);
+ if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP &&
+ conn_state != RDS_CONN_ERROR)
goto rst_nsk;
if (rs_tcp->t_sock) {
/* Need to resolve a duelling SYN between peers.
@@ -132,17 +182,17 @@ int rds_tcp_accept_one(struct socket *sock)
* c_transport_data.
*/
if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) ||
- !conn->c_outgoing) {
+ !cp->cp_outgoing) {
goto rst_nsk;
} else {
- rds_tcp_reset_callbacks(new_sock, conn);
- conn->c_outgoing = 0;
+ rds_tcp_reset_callbacks(new_sock, cp);
+ cp->cp_outgoing = 0;
/* rds_connect_path_complete() marks RDS_CONN_UP */
- rds_connect_path_complete(conn, RDS_CONN_RESETTING);
+ rds_connect_path_complete(cp, RDS_CONN_RESETTING);
}
} else {
- rds_tcp_set_callbacks(new_sock, conn);
- rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+ rds_tcp_set_callbacks(new_sock, cp);
+ rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
}
new_sock = NULL;
ret = 0;
@@ -153,7 +203,7 @@ rst_nsk:
ret = 0;
out:
if (rs_tcp)
- mutex_unlock(&rs_tcp->t_conn_lock);
+ mutex_unlock(&rs_tcp->t_conn_path_lock);
if (new_sock)
sock_release(new_sock);
return ret;
@@ -180,6 +230,8 @@ void rds_tcp_listen_data_ready(struct sock *sk)
*/
if (sk->sk_state == TCP_LISTEN)
rds_tcp_accept_work(sk);
+ else
+ ready = rds_tcp_listen_sock_def_readable(sock_net(sk));
out:
read_unlock_bh(&sk->sk_callback_lock);
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 6e6a7111a..ad4892e97 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -147,7 +147,7 @@ static void rds_tcp_cong_recv(struct rds_connection *conn,
}
struct rds_tcp_desc_arg {
- struct rds_connection *conn;
+ struct rds_conn_path *conn_path;
gfp_t gfp;
};
@@ -155,8 +155,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t len)
{
struct rds_tcp_desc_arg *arg = desc->arg.data;
- struct rds_connection *conn = arg->conn;
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_conn_path *cp = arg->conn_path;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct rds_tcp_incoming *tinc = tc->t_tinc;
struct sk_buff *clone;
size_t left = len, to_copy;
@@ -178,7 +178,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
}
tc->t_tinc = tinc;
rdsdebug("alloced tinc %p\n", tinc);
- rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
+ rds_inc_path_init(&tinc->ti_inc, cp,
+ cp->cp_conn->c_faddr);
/*
* XXX * we might be able to use the __ variants when
* we've already serialized at a higher level.
@@ -228,6 +229,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
}
if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
+ struct rds_connection *conn = cp->cp_conn;
+
if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
rds_tcp_cong_recv(conn, tinc);
else
@@ -250,15 +253,15 @@ out:
}
/* the caller has to hold the sock lock */
-static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
+static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct socket *sock = tc->t_sock;
read_descriptor_t desc;
struct rds_tcp_desc_arg arg;
/* It's like glib in the kernel! */
- arg.conn = conn;
+ arg.conn_path = cp;
arg.gfp = gfp;
desc.arg.data = &arg;
desc.error = 0;
@@ -278,16 +281,17 @@ static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
* if we fail to allocate we're in trouble.. blindly wait some time before
* trying again to see if the VM can free up something for us.
*/
-int rds_tcp_recv(struct rds_connection *conn)
+int rds_tcp_recv_path(struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct socket *sock = tc->t_sock;
int ret = 0;
- rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
+ rdsdebug("recv worker path [%d] tc %p sock %p\n",
+ cp->cp_index, tc, sock);
lock_sock(sock->sk);
- ret = rds_tcp_read_sock(conn, GFP_KERNEL);
+ ret = rds_tcp_read_sock(cp, GFP_KERNEL);
release_sock(sock->sk);
return ret;
@@ -296,24 +300,24 @@ int rds_tcp_recv(struct rds_connection *conn)
void rds_tcp_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
- struct rds_connection *conn;
+ struct rds_conn_path *cp;
struct rds_tcp_connection *tc;
rdsdebug("data ready sk %p\n", sk);
read_lock_bh(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) { /* check for teardown race */
+ cp = sk->sk_user_data;
+ if (!cp) { /* check for teardown race */
ready = sk->sk_data_ready;
goto out;
}
- tc = conn->c_transport_data;
+ tc = cp->cp_transport_data;
ready = tc->t_orig_data_ready;
rds_tcp_stats_inc(s_tcp_data_ready_calls);
- if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM)
+ queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
out:
read_unlock_bh(&sk->sk_callback_lock);
ready(sk);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 618be69c9..89d09b481 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -34,6 +34,7 @@
#include <linux/in.h>
#include <net/tcp.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "tcp.h"
@@ -48,16 +49,16 @@ static void rds_tcp_cork(struct socket *sock, int val)
set_fs(oldfs);
}
-void rds_tcp_xmit_prepare(struct rds_connection *conn)
+void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
rds_tcp_cork(tc->t_sock, 1);
}
-void rds_tcp_xmit_complete(struct rds_connection *conn)
+void rds_tcp_xmit_path_complete(struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
rds_tcp_cork(tc->t_sock, 0);
}
@@ -80,7 +81,8 @@ static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_conn_path *cp = rm->m_inc.i_conn_path;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
int done = 0;
int ret = 0;
int more;
@@ -149,10 +151,17 @@ out:
rds_tcp_stats_inc(s_tcp_sndbuf_full);
ret = 0;
} else {
- printk(KERN_WARNING "RDS/tcp: send to %pI4 "
- "returned %d, disconnecting and reconnecting\n",
- &conn->c_faddr, ret);
- rds_conn_drop(conn);
+ /* No need to disconnect/reconnect if path_drop
+ * has already been triggered, because, e.g., of
+ * an incoming RST.
+ */
+ if (rds_conn_path_up(cp)) {
+ pr_warn("RDS/tcp: send to %pI4 on cp [%d]"
+ "returned %d, "
+ "disconnecting and reconnecting\n",
+ &conn->c_faddr, cp->cp_index, ret);
+ rds_conn_path_drop(cp);
+ }
}
}
if (done == 0)
@@ -177,27 +186,27 @@ static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
void rds_tcp_write_space(struct sock *sk)
{
void (*write_space)(struct sock *sk);
- struct rds_connection *conn;
+ struct rds_conn_path *cp;
struct rds_tcp_connection *tc;
read_lock_bh(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) {
+ cp = sk->sk_user_data;
+ if (!cp) {
write_space = sk->sk_write_space;
goto out;
}
- tc = conn->c_transport_data;
+ tc = cp->cp_transport_data;
rdsdebug("write_space for tc %p\n", tc);
write_space = tc->t_orig_write_space;
rds_tcp_stats_inc(s_tcp_write_space_calls);
rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
tc->t_last_seen_una = rds_tcp_snd_una(tc);
- rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
+ rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked);
if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
out:
read_unlock_bh(&sk->sk_callback_lock);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 4a3230457..e42df11bf 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -71,30 +71,30 @@
struct workqueue_struct *rds_wq;
EXPORT_SYMBOL_GPL(rds_wq);
-void rds_connect_path_complete(struct rds_connection *conn, int curr)
+void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
{
- if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) {
+ if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) {
printk(KERN_WARNING "%s: Cannot transition to state UP, "
"current state is %d\n",
__func__,
- atomic_read(&conn->c_state));
- rds_conn_drop(conn);
+ atomic_read(&cp->cp_state));
+ rds_conn_path_drop(cp);
return;
}
rdsdebug("conn %p for %pI4 to %pI4 complete\n",
- conn, &conn->c_laddr, &conn->c_faddr);
+ cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
- conn->c_reconnect_jiffies = 0;
- set_bit(0, &conn->c_map_queued);
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ cp->cp_reconnect_jiffies = 0;
+ set_bit(0, &cp->cp_conn->c_map_queued);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
}
EXPORT_SYMBOL_GPL(rds_connect_path_complete);
void rds_connect_complete(struct rds_connection *conn)
{
- rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+ rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING);
}
EXPORT_SYMBOL_GPL(rds_connect_complete);
@@ -116,70 +116,87 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
* We should *always* start with a random backoff; otherwise a broken connection
* will always take several iterations to be re-established.
*/
-void rds_queue_reconnect(struct rds_connection *conn)
+void rds_queue_reconnect(struct rds_conn_path *cp)
{
unsigned long rand;
+ struct rds_connection *conn = cp->cp_conn;
rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
conn, &conn->c_laddr, &conn->c_faddr,
- conn->c_reconnect_jiffies);
+ cp->cp_reconnect_jiffies);
- set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
- if (conn->c_reconnect_jiffies == 0) {
- conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
- queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+ /* let peer with smaller addr initiate reconnect, to avoid duels */
+ if (conn->c_trans->t_type == RDS_TRANS_TCP &&
+ conn->c_laddr > conn->c_faddr)
+ return;
+
+ set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
+ if (cp->cp_reconnect_jiffies == 0) {
+ cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
+ queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
return;
}
get_random_bytes(&rand, sizeof(rand));
rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
- rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
+ rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
conn, &conn->c_laddr, &conn->c_faddr);
- queue_delayed_work(rds_wq, &conn->c_conn_w,
- rand % conn->c_reconnect_jiffies);
+ queue_delayed_work(rds_wq, &cp->cp_conn_w,
+ rand % cp->cp_reconnect_jiffies);
- conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
+ cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
rds_sysctl_reconnect_max_jiffies);
}
void rds_connect_worker(struct work_struct *work)
{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
+ struct rds_conn_path *cp = container_of(work,
+ struct rds_conn_path,
+ cp_conn_w.work);
+ struct rds_connection *conn = cp->cp_conn;
int ret;
- clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
- if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
- ret = conn->c_trans->conn_connect(conn);
+ if (cp->cp_index > 1 && cp->cp_conn->c_laddr > cp->cp_conn->c_faddr)
+ return;
+ clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
+ ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
+ if (ret) {
+ ret = conn->c_trans->conn_path_connect(cp);
rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
conn, &conn->c_laddr, &conn->c_faddr, ret);
if (ret) {
- if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
- rds_queue_reconnect(conn);
+ if (rds_conn_path_transition(cp,
+ RDS_CONN_CONNECTING,
+ RDS_CONN_DOWN))
+ rds_queue_reconnect(cp);
else
- rds_conn_error(conn, "RDS: connect failed\n");
+ rds_conn_path_error(cp,
+ "RDS: connect failed\n");
}
}
}
void rds_send_worker(struct work_struct *work)
{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
+ struct rds_conn_path *cp = container_of(work,
+ struct rds_conn_path,
+ cp_send_w.work);
int ret;
- if (rds_conn_state(conn) == RDS_CONN_UP) {
- clear_bit(RDS_LL_SEND_FULL, &conn->c_flags);
- ret = rds_send_xmit(conn);
+ if (rds_conn_path_state(cp) == RDS_CONN_UP) {
+ clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
+ ret = rds_send_xmit(cp);
cond_resched();
- rdsdebug("conn %p ret %d\n", conn, ret);
+ rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_send_immediate_retry);
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_send_delayed_retry);
- queue_delayed_work(rds_wq, &conn->c_send_w, 2);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
default:
break;
}
@@ -188,20 +205,22 @@ void rds_send_worker(struct work_struct *work)
void rds_recv_worker(struct work_struct *work)
{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
+ struct rds_conn_path *cp = container_of(work,
+ struct rds_conn_path,
+ cp_recv_w.work);
int ret;
- if (rds_conn_state(conn) == RDS_CONN_UP) {
- ret = conn->c_trans->recv(conn);
- rdsdebug("conn %p ret %d\n", conn, ret);
+ if (rds_conn_path_state(cp) == RDS_CONN_UP) {
+ ret = cp->cp_conn->c_trans->recv_path(cp);
+ rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_recv_immediate_retry);
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_recv_delayed_retry);
- queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
+ queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
default:
break;
}
@@ -210,9 +229,11 @@ void rds_recv_worker(struct work_struct *work)
void rds_shutdown_worker(struct work_struct *work)
{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+ struct rds_conn_path *cp = container_of(work,
+ struct rds_conn_path,
+ cp_down_w);
- rds_conn_shutdown(conn);
+ rds_conn_shutdown(cp);
}
void rds_threads_exit(void)
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index e05a06ef2..10f3f48a1 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -4,25 +4,28 @@
af-rxrpc-y := \
af_rxrpc.o \
- ar-accept.o \
- ar-ack.o \
- ar-call.o \
- ar-connection.o \
- ar-connevent.o \
- ar-error.o \
- ar-input.o \
- ar-key.o \
- ar-local.o \
- ar-output.o \
- ar-peer.o \
- ar-recvmsg.o \
- ar-security.o \
- ar-skbuff.o \
- ar-transport.o \
+ call_accept.o \
+ call_event.o \
+ call_object.o \
+ conn_client.o \
+ conn_event.o \
+ conn_object.o \
+ conn_service.o \
+ input.o \
insecure.o \
- misc.o
+ key.o \
+ local_event.o \
+ local_object.o \
+ misc.o \
+ output.o \
+ peer_event.o \
+ peer_object.o \
+ recvmsg.o \
+ security.o \
+ skbuff.o \
+ utils.o
-af-rxrpc-$(CONFIG_PROC_FS) += ar-proc.o
+af-rxrpc-$(CONFIG_PROC_FS) += proc.o
af-rxrpc-$(CONFIG_RXKAD) += rxkad.o
af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index e45e94ca0..88effadd4 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -9,6 +9,8 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/net.h>
@@ -31,8 +33,6 @@ unsigned int rxrpc_debug; // = RXRPC_DEBUG_KPROTO;
module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(debug, "RxRPC debugging mask");
-static int sysctl_rxrpc_max_qlen __read_mostly = 10;
-
static struct proto rxrpc_proto;
static const struct proto_ops rxrpc_rpc_ops;
@@ -97,11 +97,13 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
srx->transport_len > len)
return -EINVAL;
- if (srx->transport.family != rx->proto)
+ if (srx->transport.family != rx->family)
return -EAFNOSUPPORT;
switch (srx->transport.family) {
case AF_INET:
+ if (srx->transport_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
_debug("INET: %x @ %pI4",
ntohs(srx->transport.sin.sin_port),
&srx->transport.sin.sin_addr);
@@ -137,33 +139,33 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
lock_sock(&rx->sk);
- if (rx->sk.sk_state != RXRPC_UNCONNECTED) {
+ if (rx->sk.sk_state != RXRPC_UNBOUND) {
ret = -EINVAL;
goto error_unlock;
}
memcpy(&rx->srx, srx, sizeof(rx->srx));
- /* Find or create a local transport endpoint to use */
local = rxrpc_lookup_local(&rx->srx);
if (IS_ERR(local)) {
ret = PTR_ERR(local);
goto error_unlock;
}
- rx->local = local;
- if (srx->srx_service) {
+ if (rx->srx.srx_service) {
write_lock_bh(&local->services_lock);
list_for_each_entry(prx, &local->services, listen_link) {
- if (prx->srx.srx_service == srx->srx_service)
+ if (prx->srx.srx_service == rx->srx.srx_service)
goto service_in_use;
}
+ rx->local = local;
list_add_tail(&rx->listen_link, &local->services);
write_unlock_bh(&local->services_lock);
rx->sk.sk_state = RXRPC_SERVER_BOUND;
} else {
+ rx->local = local;
rx->sk.sk_state = RXRPC_CLIENT_BOUND;
}
@@ -172,8 +174,9 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
return 0;
service_in_use:
- ret = -EADDRINUSE;
write_unlock_bh(&local->services_lock);
+ rxrpc_put_local(local);
+ ret = -EADDRINUSE;
error_unlock:
release_sock(&rx->sk);
error:
@@ -188,6 +191,7 @@ static int rxrpc_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
struct rxrpc_sock *rx = rxrpc_sk(sk);
+ unsigned int max;
int ret;
_enter("%p,%d", rx, backlog);
@@ -195,20 +199,24 @@ static int rxrpc_listen(struct socket *sock, int backlog)
lock_sock(&rx->sk);
switch (rx->sk.sk_state) {
- case RXRPC_UNCONNECTED:
+ case RXRPC_UNBOUND:
ret = -EADDRNOTAVAIL;
break;
- case RXRPC_CLIENT_BOUND:
- case RXRPC_CLIENT_CONNECTED:
- default:
- ret = -EBUSY;
- break;
case RXRPC_SERVER_BOUND:
ASSERT(rx->local != NULL);
+ max = READ_ONCE(rxrpc_max_backlog);
+ ret = -EINVAL;
+ if (backlog == INT_MAX)
+ backlog = max;
+ else if (backlog < 0 || backlog > max)
+ break;
sk->sk_max_ack_backlog = backlog;
rx->sk.sk_state = RXRPC_SERVER_LISTENING;
ret = 0;
break;
+ default:
+ ret = -EBUSY;
+ break;
}
release_sock(&rx->sk);
@@ -216,45 +224,10 @@ static int rxrpc_listen(struct socket *sock, int backlog)
return ret;
}
-/*
- * find a transport by address
- */
-static struct rxrpc_transport *rxrpc_name_to_transport(struct socket *sock,
- struct sockaddr *addr,
- int addr_len, int flags,
- gfp_t gfp)
-{
- struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr;
- struct rxrpc_transport *trans;
- struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- struct rxrpc_peer *peer;
-
- _enter("%p,%p,%d,%d", rx, addr, addr_len, flags);
-
- ASSERT(rx->local != NULL);
- ASSERT(rx->sk.sk_state > RXRPC_UNCONNECTED);
-
- if (rx->srx.transport_type != srx->transport_type)
- return ERR_PTR(-ESOCKTNOSUPPORT);
- if (rx->srx.transport.family != srx->transport.family)
- return ERR_PTR(-EAFNOSUPPORT);
-
- /* find a remote transport endpoint from the local one */
- peer = rxrpc_get_peer(srx, gfp);
- if (IS_ERR(peer))
- return ERR_CAST(peer);
-
- /* find a transport */
- trans = rxrpc_get_transport(rx->local, peer, gfp);
- rxrpc_put_peer(peer);
- _leave(" = %p", trans);
- return trans;
-}
-
/**
* rxrpc_kernel_begin_call - Allow a kernel service to begin a call
* @sock: The socket on which to make the call
- * @srx: The address of the peer to contact (defaults to socket setting)
+ * @srx: The address of the peer to contact
* @key: The security context to use (defaults to socket setting)
* @user_call_ID: The ID to use
*
@@ -271,51 +244,32 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
unsigned long user_call_ID,
gfp_t gfp)
{
- struct rxrpc_conn_bundle *bundle;
- struct rxrpc_transport *trans;
+ struct rxrpc_conn_parameters cp;
struct rxrpc_call *call;
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ int ret;
_enter(",,%x,%lx", key_serial(key), user_call_ID);
- lock_sock(&rx->sk);
+ ret = rxrpc_validate_address(rx, srx, sizeof(*srx));
+ if (ret < 0)
+ return ERR_PTR(ret);
- if (srx) {
- trans = rxrpc_name_to_transport(sock, (struct sockaddr *) srx,
- sizeof(*srx), 0, gfp);
- if (IS_ERR(trans)) {
- call = ERR_CAST(trans);
- trans = NULL;
- goto out_notrans;
- }
- } else {
- trans = rx->trans;
- if (!trans) {
- call = ERR_PTR(-ENOTCONN);
- goto out_notrans;
- }
- atomic_inc(&trans->usage);
- }
+ lock_sock(&rx->sk);
- if (!srx)
- srx = &rx->srx;
if (!key)
key = rx->key;
if (key && !key->payload.data[0])
key = NULL; /* a no-security key */
- bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp);
- if (IS_ERR(bundle)) {
- call = ERR_CAST(bundle);
- goto out;
- }
+ memset(&cp, 0, sizeof(cp));
+ cp.local = rx->local;
+ cp.key = key;
+ cp.security_level = 0;
+ cp.exclusive = false;
+ cp.service_id = srx->srx_service;
+ call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp);
- call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, true,
- gfp);
- rxrpc_put_bundle(trans, bundle);
-out:
- rxrpc_put_transport(trans);
-out_notrans:
release_sock(&rx->sk);
_leave(" = %p", call);
return call;
@@ -367,11 +321,8 @@ EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages);
static int rxrpc_connect(struct socket *sock, struct sockaddr *addr,
int addr_len, int flags)
{
- struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr;
- struct sock *sk = sock->sk;
- struct rxrpc_transport *trans;
- struct rxrpc_local *local;
- struct rxrpc_sock *rx = rxrpc_sk(sk);
+ struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)addr;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
int ret;
_enter("%p,%p,%d,%d", rx, addr, addr_len, flags);
@@ -384,45 +335,28 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr,
lock_sock(&rx->sk);
+ ret = -EISCONN;
+ if (test_bit(RXRPC_SOCK_CONNECTED, &rx->flags))
+ goto error;
+
switch (rx->sk.sk_state) {
- case RXRPC_UNCONNECTED:
- /* find a local transport endpoint if we don't have one already */
- ASSERTCMP(rx->local, ==, NULL);
- rx->srx.srx_family = AF_RXRPC;
- rx->srx.srx_service = 0;
- rx->srx.transport_type = srx->transport_type;
- rx->srx.transport_len = sizeof(sa_family_t);
- rx->srx.transport.family = srx->transport.family;
- local = rxrpc_lookup_local(&rx->srx);
- if (IS_ERR(local)) {
- release_sock(&rx->sk);
- return PTR_ERR(local);
- }
- rx->local = local;
- rx->sk.sk_state = RXRPC_CLIENT_BOUND;
+ case RXRPC_UNBOUND:
+ rx->sk.sk_state = RXRPC_CLIENT_UNBOUND;
+ case RXRPC_CLIENT_UNBOUND:
case RXRPC_CLIENT_BOUND:
break;
- case RXRPC_CLIENT_CONNECTED:
- release_sock(&rx->sk);
- return -EISCONN;
default:
- release_sock(&rx->sk);
- return -EBUSY; /* server sockets can't connect as well */
- }
-
- trans = rxrpc_name_to_transport(sock, addr, addr_len, flags,
- GFP_KERNEL);
- if (IS_ERR(trans)) {
- release_sock(&rx->sk);
- _leave(" = %ld", PTR_ERR(trans));
- return PTR_ERR(trans);
+ ret = -EBUSY;
+ goto error;
}
- rx->trans = trans;
- rx->sk.sk_state = RXRPC_CLIENT_CONNECTED;
+ rx->connect_srx = *srx;
+ set_bit(RXRPC_SOCK_CONNECTED, &rx->flags);
+ ret = 0;
+error:
release_sock(&rx->sk);
- return 0;
+ return ret;
}
/*
@@ -436,7 +370,7 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr,
*/
static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
- struct rxrpc_transport *trans;
+ struct rxrpc_local *local;
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
int ret;
@@ -453,48 +387,38 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
}
}
- trans = NULL;
lock_sock(&rx->sk);
- if (m->msg_name) {
- ret = -EISCONN;
- trans = rxrpc_name_to_transport(sock, m->msg_name,
- m->msg_namelen, 0, GFP_KERNEL);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
- } else {
- trans = rx->trans;
- if (trans)
- atomic_inc(&trans->usage);
- }
-
switch (rx->sk.sk_state) {
- case RXRPC_SERVER_LISTENING:
- if (!m->msg_name) {
- ret = rxrpc_server_sendmsg(rx, m, len);
- break;
+ case RXRPC_UNBOUND:
+ local = rxrpc_lookup_local(&rx->srx);
+ if (IS_ERR(local)) {
+ ret = PTR_ERR(local);
+ goto error_unlock;
}
- case RXRPC_SERVER_BOUND:
+
+ rx->local = local;
+ rx->sk.sk_state = RXRPC_CLIENT_UNBOUND;
+ /* Fall through */
+
+ case RXRPC_CLIENT_UNBOUND:
case RXRPC_CLIENT_BOUND:
- if (!m->msg_name) {
- ret = -ENOTCONN;
- break;
+ if (!m->msg_name &&
+ test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) {
+ m->msg_name = &rx->connect_srx;
+ m->msg_namelen = sizeof(rx->connect_srx);
}
- case RXRPC_CLIENT_CONNECTED:
- ret = rxrpc_client_sendmsg(rx, trans, m, len);
+ case RXRPC_SERVER_BOUND:
+ case RXRPC_SERVER_LISTENING:
+ ret = rxrpc_do_sendmsg(rx, m, len);
break;
default:
- ret = -ENOTCONN;
+ ret = -EINVAL;
break;
}
-out:
+error_unlock:
release_sock(&rx->sk);
- if (trans)
- rxrpc_put_transport(trans);
_leave(" = %d", ret);
return ret;
}
@@ -521,9 +445,9 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
if (optlen != 0)
goto error;
ret = -EISCONN;
- if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+ if (rx->sk.sk_state != RXRPC_UNBOUND)
goto error;
- set_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags);
+ rx->exclusive = true;
goto success;
case RXRPC_SECURITY_KEY:
@@ -531,7 +455,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
if (rx->key)
goto error;
ret = -EISCONN;
- if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+ if (rx->sk.sk_state != RXRPC_UNBOUND)
goto error;
ret = rxrpc_request_key(rx, optval, optlen);
goto error;
@@ -541,7 +465,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
if (rx->key)
goto error;
ret = -EISCONN;
- if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+ if (rx->sk.sk_state != RXRPC_UNBOUND)
goto error;
ret = rxrpc_server_keyring(rx, optval, optlen);
goto error;
@@ -551,7 +475,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
if (optlen != sizeof(unsigned int))
goto error;
ret = -EISCONN;
- if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+ if (rx->sk.sk_state != RXRPC_UNBOUND)
goto error;
ret = get_user(min_sec_level,
(unsigned int __user *) optval);
@@ -630,13 +554,13 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
return -ENOMEM;
sock_init_data(sock, sk);
- sk->sk_state = RXRPC_UNCONNECTED;
+ sk->sk_state = RXRPC_UNBOUND;
sk->sk_write_space = rxrpc_write_space;
- sk->sk_max_ack_backlog = sysctl_rxrpc_max_qlen;
+ sk->sk_max_ack_backlog = 0;
sk->sk_destruct = rxrpc_sock_destructor;
rx = rxrpc_sk(sk);
- rx->proto = protocol;
+ rx->family = protocol;
rx->calls = RB_ROOT;
INIT_LIST_HEAD(&rx->listen_link);
@@ -698,24 +622,8 @@ static int rxrpc_release_sock(struct sock *sk)
flush_workqueue(rxrpc_workqueue);
rxrpc_purge_queue(&sk->sk_receive_queue);
- if (rx->conn) {
- rxrpc_put_connection(rx->conn);
- rx->conn = NULL;
- }
-
- if (rx->bundle) {
- rxrpc_put_bundle(rx->trans, rx->bundle);
- rx->bundle = NULL;
- }
- if (rx->trans) {
- rxrpc_put_transport(rx->trans);
- rx->trans = NULL;
- }
- if (rx->local) {
- rxrpc_put_local(rx->local);
- rx->local = NULL;
- }
-
+ rxrpc_put_local(rx->local);
+ rx->local = NULL;
key_put(rx->key);
rx->key = NULL;
key_put(rx->securities);
@@ -796,49 +704,49 @@ static int __init af_rxrpc_init(void)
"rxrpc_call_jar", sizeof(struct rxrpc_call), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!rxrpc_call_jar) {
- printk(KERN_NOTICE "RxRPC: Failed to allocate call jar\n");
+ pr_notice("Failed to allocate call jar\n");
goto error_call_jar;
}
rxrpc_workqueue = alloc_workqueue("krxrpcd", 0, 1);
if (!rxrpc_workqueue) {
- printk(KERN_NOTICE "RxRPC: Failed to allocate work queue\n");
+ pr_notice("Failed to allocate work queue\n");
goto error_work_queue;
}
ret = rxrpc_init_security();
if (ret < 0) {
- printk(KERN_CRIT "RxRPC: Cannot initialise security\n");
+ pr_crit("Cannot initialise security\n");
goto error_security;
}
ret = proto_register(&rxrpc_proto, 1);
if (ret < 0) {
- printk(KERN_CRIT "RxRPC: Cannot register protocol\n");
+ pr_crit("Cannot register protocol\n");
goto error_proto;
}
ret = sock_register(&rxrpc_family_ops);
if (ret < 0) {
- printk(KERN_CRIT "RxRPC: Cannot register socket family\n");
+ pr_crit("Cannot register socket family\n");
goto error_sock;
}
ret = register_key_type(&key_type_rxrpc);
if (ret < 0) {
- printk(KERN_CRIT "RxRPC: Cannot register client key type\n");
+ pr_crit("Cannot register client key type\n");
goto error_key_type;
}
ret = register_key_type(&key_type_rxrpc_s);
if (ret < 0) {
- printk(KERN_CRIT "RxRPC: Cannot register server key type\n");
+ pr_crit("Cannot register server key type\n");
goto error_key_type_s;
}
ret = rxrpc_sysctl_init();
if (ret < 0) {
- printk(KERN_CRIT "RxRPC: Cannot register sysctls\n");
+ pr_crit("Cannot register sysctls\n");
goto error_sysctls;
}
@@ -858,9 +766,9 @@ error_key_type:
error_sock:
proto_unregister(&rxrpc_proto);
error_proto:
- destroy_workqueue(rxrpc_workqueue);
-error_security:
rxrpc_exit_security();
+error_security:
+ destroy_workqueue(rxrpc_workqueue);
error_work_queue:
kmem_cache_destroy(rxrpc_call_jar);
error_call_jar:
@@ -880,14 +788,9 @@ static void __exit af_rxrpc_exit(void)
proto_unregister(&rxrpc_proto);
rxrpc_destroy_all_calls();
rxrpc_destroy_all_connections();
- rxrpc_destroy_all_transports();
- rxrpc_destroy_all_peers();
- rxrpc_destroy_all_locals();
-
ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0);
+ rxrpc_destroy_all_locals();
- _debug("flush scheduled work");
- flush_workqueue(rxrpc_workqueue);
remove_proc_entry("rxrpc_conns", init_net.proc_net);
remove_proc_entry("rxrpc_calls", init_net.proc_net);
destroy_workqueue(rxrpc_workqueue);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index f0b807a16..ff83fb1dd 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -9,7 +9,10 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/atomic.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
+#include <net/af_rxrpc.h>
#include <rxrpc/packet.h>
#if 0
@@ -33,15 +36,16 @@ struct rxrpc_crypt {
queue_delayed_work(rxrpc_workqueue, (WS), (D))
#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor)
-#define rxrpc_queue_conn(CONN) rxrpc_queue_work(&(CONN)->processor)
+
+struct rxrpc_connection;
/*
* sk_state for RxRPC sockets
*/
enum {
- RXRPC_UNCONNECTED = 0,
+ RXRPC_UNBOUND = 0,
+ RXRPC_CLIENT_UNBOUND, /* Unbound socket used as client */
RXRPC_CLIENT_BOUND, /* client local address bound */
- RXRPC_CLIENT_CONNECTED, /* client is connected */
RXRPC_SERVER_BOUND, /* server local address bound */
RXRPC_SERVER_LISTENING, /* server listening for connections */
RXRPC_CLOSE, /* socket is being closed */
@@ -55,9 +59,6 @@ struct rxrpc_sock {
struct sock sk;
rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */
struct rxrpc_local *local; /* local endpoint */
- struct rxrpc_transport *trans; /* transport handler */
- struct rxrpc_conn_bundle *bundle; /* virtual connection bundle */
- struct rxrpc_connection *conn; /* exclusive virtual connection */
struct list_head listen_link; /* link in the local endpoint's listen list */
struct list_head secureq; /* calls awaiting connection security clearance */
struct list_head acceptq; /* calls awaiting acceptance */
@@ -65,12 +66,14 @@ struct rxrpc_sock {
struct key *securities; /* list of server security descriptors */
struct rb_root calls; /* outstanding calls on this socket */
unsigned long flags;
-#define RXRPC_SOCK_EXCLUSIVE_CONN 1 /* exclusive connection for a client socket */
+#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */
rwlock_t call_lock; /* lock for calls */
u32 min_sec_level; /* minimum security level */
#define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT
+ bool exclusive; /* Exclusive connection for a client socket */
+ sa_family_t family; /* Protocol family created with */
struct sockaddr_rxrpc srx; /* local address */
- sa_family_t proto; /* protocol created with */
+ struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */
};
#define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk)
@@ -138,17 +141,16 @@ struct rxrpc_security {
int (*init_connection_security)(struct rxrpc_connection *);
/* prime a connection's packet security */
- void (*prime_packet_security)(struct rxrpc_connection *);
+ int (*prime_packet_security)(struct rxrpc_connection *);
/* impose security on a packet */
- int (*secure_packet)(const struct rxrpc_call *,
+ int (*secure_packet)(struct rxrpc_call *,
struct sk_buff *,
size_t,
void *);
/* verify the security on a received packet */
- int (*verify_packet)(const struct rxrpc_call *, struct sk_buff *,
- u32 *);
+ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *);
/* issue a challenge */
int (*issue_challenge)(struct rxrpc_connection *);
@@ -168,46 +170,52 @@ struct rxrpc_security {
};
/*
- * RxRPC local transport endpoint definition
- * - matched by local port, address and protocol type
+ * RxRPC local transport endpoint description
+ * - owned by a single AF_RXRPC socket
+ * - pointed to by transport socket struct sk_user_data
*/
struct rxrpc_local {
+ struct rcu_head rcu;
+ atomic_t usage;
+ struct list_head link;
struct socket *socket; /* my UDP socket */
- struct work_struct destroyer; /* endpoint destroyer */
- struct work_struct acceptor; /* incoming call processor */
- struct work_struct rejecter; /* packet reject writer */
- struct work_struct event_processor; /* endpoint event processor */
+ struct work_struct processor;
struct list_head services; /* services listening on this endpoint */
- struct list_head link; /* link in endpoint list */
struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */
struct sk_buff_head reject_queue; /* packets awaiting rejection */
struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
+ struct rb_root client_conns; /* Client connections by socket params */
+ spinlock_t client_conns_lock; /* Lock for client_conns */
spinlock_t lock; /* access lock */
rwlock_t services_lock; /* lock for services list */
- atomic_t usage;
int debug_id; /* debug ID for printks */
- volatile char error_rcvd; /* T if received ICMP error outstanding */
+ bool dead;
struct sockaddr_rxrpc srx; /* local address */
};
/*
* RxRPC remote transport endpoint definition
- * - matched by remote port, address and protocol type
- * - holds the connection ID counter for connections between the two endpoints
+ * - matched by local endpoint, remote port, address and protocol type
*/
struct rxrpc_peer {
- struct work_struct destroyer; /* peer destroyer */
- struct list_head link; /* link in master peer list */
- struct list_head error_targets; /* targets for net error distribution */
- spinlock_t lock; /* access lock */
+ struct rcu_head rcu; /* This must be first */
atomic_t usage;
+ unsigned long hash_key;
+ struct hlist_node hash_link;
+ struct rxrpc_local *local;
+ struct hlist_head error_targets; /* targets for net error distribution */
+ struct work_struct error_distributor;
+ struct rb_root service_conns; /* Service connections */
+ seqlock_t service_conn_lock;
+ spinlock_t lock; /* access lock */
unsigned int if_mtu; /* interface MTU for this peer */
unsigned int mtu; /* network MTU for this peer */
unsigned int maxdata; /* data size (MTU - hdrsize) */
unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
int debug_id; /* debug ID for printks */
- int net_error; /* network error distributed */
+ int error_report; /* Net (+0) or local (+1000000) to distribute */
+#define RXRPC_LOCAL_ERROR_OFFSET 1000000
struct sockaddr_rxrpc srx; /* remote address */
/* calculated RTT cache */
@@ -219,99 +227,108 @@ struct rxrpc_peer {
};
/*
- * RxRPC point-to-point transport / connection manager definition
- * - handles a bundle of connections between two endpoints
- * - matched by { local, peer }
- */
-struct rxrpc_transport {
- struct rxrpc_local *local; /* local transport endpoint */
- struct rxrpc_peer *peer; /* remote transport endpoint */
- struct work_struct error_handler; /* network error distributor */
- struct rb_root bundles; /* client connection bundles on this transport */
- struct rb_root client_conns; /* client connections on this transport */
- struct rb_root server_conns; /* server connections on this transport */
- struct list_head link; /* link in master session list */
- struct sk_buff_head error_queue; /* error packets awaiting processing */
- unsigned long put_time; /* time at which to reap */
- spinlock_t client_lock; /* client connection allocation lock */
- rwlock_t conn_lock; /* lock for active/dead connections */
- atomic_t usage;
- int debug_id; /* debug ID for printks */
- unsigned int conn_idcounter; /* connection ID counter (client) */
+ * Keys for matching a connection.
+ */
+struct rxrpc_conn_proto {
+ union {
+ struct {
+ u32 epoch; /* epoch of this connection */
+ u32 cid; /* connection ID */
+ };
+ u64 index_key;
+ };
+};
+
+struct rxrpc_conn_parameters {
+ struct rxrpc_local *local; /* Representation of local endpoint */
+ struct rxrpc_peer *peer; /* Remote endpoint */
+ struct key *key; /* Security details */
+ bool exclusive; /* T if conn is exclusive */
+ u16 service_id; /* Service ID for this connection */
+ u32 security_level; /* Security level selected */
};
/*
- * RxRPC client connection bundle
- * - matched by { transport, service_id, key }
+ * Bits in the connection flags.
*/
-struct rxrpc_conn_bundle {
- struct rb_node node; /* node in transport's lookup tree */
- struct list_head unused_conns; /* unused connections in this bundle */
- struct list_head avail_conns; /* available connections in this bundle */
- struct list_head busy_conns; /* busy connections in this bundle */
- struct key *key; /* security for this bundle */
- wait_queue_head_t chanwait; /* wait for channel to become available */
- atomic_t usage;
- int debug_id; /* debug ID for printks */
- unsigned short num_conns; /* number of connections in this bundle */
- u16 service_id; /* Service ID for this bundle */
- u8 security_ix; /* security type */
+enum rxrpc_conn_flag {
+ RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */
+ RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */
+ RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */
+};
+
+/*
+ * Events that can be raised upon a connection.
+ */
+enum rxrpc_conn_event {
+ RXRPC_CONN_EV_CHALLENGE, /* Send challenge packet */
+};
+
+/*
+ * The connection protocol state.
+ */
+enum rxrpc_conn_proto_state {
+ RXRPC_CONN_UNUSED, /* Connection not yet attempted */
+ RXRPC_CONN_CLIENT, /* Client connection */
+ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */
+ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */
+ RXRPC_CONN_SERVICE, /* Service secured connection */
+ RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */
+ RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */
+ RXRPC_CONN_NETWORK_ERROR, /* Conn terminated by network error */
+ RXRPC_CONN__NR_STATES
};
/*
* RxRPC connection definition
- * - matched by { transport, service_id, conn_id, direction, key }
+ * - matched by { local, peer, epoch, conn_id, direction }
* - each connection can only handle four simultaneous calls
*/
struct rxrpc_connection {
- struct rxrpc_transport *trans; /* transport session */
- struct rxrpc_conn_bundle *bundle; /* connection bundle (client) */
+ struct rxrpc_conn_proto proto;
+ struct rxrpc_conn_parameters params;
+
+ spinlock_t channel_lock;
+
+ struct rxrpc_channel {
+ struct rxrpc_call __rcu *call; /* Active call */
+ u32 call_id; /* ID of current call */
+ u32 call_counter; /* Call ID counter */
+ u32 last_call; /* ID of last call */
+ u32 last_result; /* Result of last call (0/abort) */
+ } channels[RXRPC_MAXCALLS];
+ wait_queue_head_t channel_wq; /* queue to wait for channel to become available */
+
+ struct rcu_head rcu;
struct work_struct processor; /* connection event processor */
- struct rb_node node; /* node in transport's lookup tree */
+ union {
+ struct rb_node client_node; /* Node in local->client_conns */
+ struct rb_node service_node; /* Node in peer->service_conns */
+ };
struct list_head link; /* link in master connection list */
- struct list_head bundle_link; /* link in bundle */
- struct rb_root calls; /* calls on this connection */
struct sk_buff_head rx_queue; /* received conn-level packets */
- struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */
const struct rxrpc_security *security; /* applied security module */
- struct key *key; /* security for this connection (client) */
struct key *server_key; /* security for this service */
struct crypto_skcipher *cipher; /* encryption handle */
struct rxrpc_crypt csum_iv; /* packet checksum base */
+ unsigned long flags;
unsigned long events;
-#define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */
- unsigned long put_time; /* time at which to reap */
- rwlock_t lock; /* access lock */
+ unsigned long put_time; /* Time at which last put */
spinlock_t state_lock; /* state-change lock */
atomic_t usage;
- enum { /* current state of connection */
- RXRPC_CONN_UNUSED, /* - connection not yet attempted */
- RXRPC_CONN_CLIENT, /* - client connection */
- RXRPC_CONN_SERVER_UNSECURED, /* - server unsecured connection */
- RXRPC_CONN_SERVER_CHALLENGING, /* - server challenging for security */
- RXRPC_CONN_SERVER, /* - server secured connection */
- RXRPC_CONN_REMOTELY_ABORTED, /* - conn aborted by peer */
- RXRPC_CONN_LOCALLY_ABORTED, /* - conn aborted locally */
- RXRPC_CONN_NETWORK_ERROR, /* - conn terminated by network error */
- } state;
+ enum rxrpc_conn_proto_state state : 8; /* current state of connection */
u32 local_abort; /* local abort code */
u32 remote_abort; /* remote abort code */
int error; /* local error incurred */
int debug_id; /* debug ID for printks */
- unsigned int call_counter; /* call ID counter */
atomic_t serial; /* packet serial number counter */
atomic_t hi_serial; /* highest serial number received */
- u8 avail_calls; /* number of calls available */
+ atomic_t avail_chans; /* number of channels available */
u8 size_align; /* data size alignment (for security) */
u8 header_size; /* rxrpc + security header size */
u8 security_size; /* security header size */
- u32 security_level; /* security level negotiated */
u32 security_nonce; /* response re-use preventer */
- u32 epoch; /* epoch of this connection */
- u32 cid; /* connection ID */
- u16 service_id; /* service ID for this connection */
u8 security_ix; /* security type */
- u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
};
@@ -357,6 +374,8 @@ enum rxrpc_call_event {
* The states that a call can be in.
*/
enum rxrpc_call_state {
+ RXRPC_CALL_UNINITIALISED,
+ RXRPC_CALL_CLIENT_AWAIT_CONN, /* - client waiting for connection to become available */
RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */
RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
@@ -381,6 +400,7 @@ enum rxrpc_call_state {
* - matched by { connection, call_id }
*/
struct rxrpc_call {
+ struct rcu_head rcu;
struct rxrpc_connection *conn; /* connection carrying call */
struct rxrpc_sock *socket; /* socket responsible */
struct timer_list lifetimer; /* lifetime remaining on call */
@@ -390,14 +410,14 @@ struct rxrpc_call {
struct work_struct destroyer; /* call destroyer */
struct work_struct processor; /* packet processor and ACK generator */
struct list_head link; /* link in master call list */
- struct list_head error_link; /* link in error distribution list */
+ struct hlist_node error_link; /* link in error distribution list */
struct list_head accept_link; /* calls awaiting acceptance */
struct rb_node sock_node; /* node in socket call tree */
- struct rb_node conn_node; /* node in connection call tree */
struct sk_buff_head rx_queue; /* received packets */
struct sk_buff_head rx_oos_queue; /* packets received out of sequence */
struct sk_buff *tx_pending; /* Tx socket buffer being filled */
wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */
+ __be32 crypto_buf[2]; /* Temporary packet crypto buffer */
unsigned long user_call_ID; /* user-defined call ID */
unsigned long creation_jif; /* time of call creation */
unsigned long flags;
@@ -405,10 +425,12 @@ struct rxrpc_call {
spinlock_t lock;
rwlock_t state_lock; /* lock for state transition */
atomic_t usage;
+ atomic_t skb_count; /* Outstanding packets on this call */
atomic_t sequence; /* Tx data packet sequence counter */
u32 local_abort; /* local abort code */
u32 remote_abort; /* remote abort code */
- int error; /* local error incurred */
+ int error_report; /* Network error (ICMP/local transport) */
+ int error; /* Local error incurred */
enum rxrpc_call_state state : 8; /* current state of call */
int debug_id; /* debug ID for printks */
u8 channel; /* connection channel occupied by this call */
@@ -440,19 +462,12 @@ struct rxrpc_call {
#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG)
unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1];
- struct hlist_node hash_node;
- unsigned long hash_key; /* Full hash key */
- u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */
- struct rxrpc_local *local; /* Local endpoint. Used for hashing. */
- sa_family_t proto; /* Frame protocol */
+ u8 in_clientflag; /* Copy of conn->in_clientflag */
+ struct rxrpc_local *local; /* Local endpoint. */
u32 call_id; /* call ID on connection */
u32 cid; /* connection ID plus channel index */
u32 epoch; /* epoch of this connection */
u16 service_id; /* service ID */
- union { /* Peer IP address for hashing */
- __be32 ipv4_addr;
- __u8 ipv6_addr[16]; /* Anticipates eventual IPv6 support */
- } peer_ip;
};
/*
@@ -478,21 +493,21 @@ extern atomic_t rxrpc_debug_id;
extern struct workqueue_struct *rxrpc_workqueue;
/*
- * ar-accept.c
+ * call_accept.c
*/
-void rxrpc_accept_incoming_calls(struct work_struct *);
+void rxrpc_accept_incoming_calls(struct rxrpc_local *);
struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long);
int rxrpc_reject_call(struct rxrpc_sock *);
/*
- * ar-ack.c
+ * call_event.c
*/
void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
void rxrpc_process_call(struct work_struct *);
/*
- * ar-call.c
+ * call_object.c
*/
extern unsigned int rxrpc_max_call_lifetime;
extern unsigned int rxrpc_dead_call_expiry;
@@ -500,72 +515,106 @@ extern struct kmem_cache *rxrpc_call_jar;
extern struct list_head rxrpc_calls;
extern rwlock_t rxrpc_call_lock;
-struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *,
- void *, sa_family_t, const void *);
-struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *,
- struct rxrpc_transport *,
- struct rxrpc_conn_bundle *,
- unsigned long, int, gfp_t);
+struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
+struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
+ struct rxrpc_conn_parameters *,
+ struct sockaddr_rxrpc *,
+ unsigned long, gfp_t);
struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
struct rxrpc_connection *,
- struct rxrpc_host_header *);
-struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long);
+ struct sk_buff *);
void rxrpc_release_call(struct rxrpc_call *);
void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
void __rxrpc_put_call(struct rxrpc_call *);
void __exit rxrpc_destroy_all_calls(void);
/*
- * ar-connection.c
+ * conn_client.c
+ */
+extern struct idr rxrpc_client_conn_ids;
+
+void rxrpc_destroy_client_conn_ids(void);
+int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *,
+ struct sockaddr_rxrpc *, gfp_t);
+void rxrpc_unpublish_client_conn(struct rxrpc_connection *);
+
+/*
+ * conn_event.c
+ */
+void rxrpc_process_connection(struct work_struct *);
+void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *);
+void rxrpc_reject_packets(struct rxrpc_local *);
+
+/*
+ * conn_object.c
*/
extern unsigned int rxrpc_connection_expiry;
extern struct list_head rxrpc_connections;
extern rwlock_t rxrpc_connection_lock;
-struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *,
- struct rxrpc_transport *,
- struct key *, u16, gfp_t);
-void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *);
-int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *,
- struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t);
+int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
+struct rxrpc_connection *rxrpc_alloc_connection(gfp_t);
+struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
+ struct sk_buff *);
+void __rxrpc_disconnect_call(struct rxrpc_call *);
+void rxrpc_disconnect_call(struct rxrpc_call *);
void rxrpc_put_connection(struct rxrpc_connection *);
void __exit rxrpc_destroy_all_connections(void);
-struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *,
- struct rxrpc_host_header *);
-extern struct rxrpc_connection *
-rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *);
-/*
- * ar-connevent.c
- */
-void rxrpc_process_connection(struct work_struct *);
-void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *);
-void rxrpc_reject_packets(struct work_struct *);
+static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn)
+{
+ return conn->out_clientflag;
+}
+
+static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn)
+{
+ return !rxrpc_conn_is_client(conn);
+}
+
+static inline void rxrpc_get_connection(struct rxrpc_connection *conn)
+{
+ atomic_inc(&conn->usage);
+}
+
+static inline
+struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
+{
+ return atomic_inc_not_zero(&conn->usage) ? conn : NULL;
+}
+
+static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn)
+{
+ if (!rxrpc_get_connection_maybe(conn))
+ return false;
+ if (!rxrpc_queue_work(&conn->processor))
+ rxrpc_put_connection(conn);
+ return true;
+}
/*
- * ar-error.c
+ * conn_service.c
*/
-void rxrpc_UDP_error_report(struct sock *);
-void rxrpc_UDP_error_handler(struct work_struct *);
+struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *,
+ struct sk_buff *);
+struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *,
+ struct sockaddr_rxrpc *,
+ struct sk_buff *);
+void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
/*
- * ar-input.c
+ * input.c
*/
void rxrpc_data_ready(struct sock *);
int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool);
void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *);
/*
- * ar-local.c
+ * insecure.c
*/
-extern rwlock_t rxrpc_local_lock;
-
-struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *);
-void rxrpc_put_local(struct rxrpc_local *);
-void __exit rxrpc_destroy_all_locals(void);
+extern const struct rxrpc_security rxrpc_no_security;
/*
- * ar-key.c
+ * key.c
*/
extern struct key_type key_type_rxrpc;
extern struct key_type key_type_rxrpc_s;
@@ -576,80 +625,108 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t,
u32);
/*
- * ar-output.c
+ * local_event.c
*/
-extern unsigned int rxrpc_resend_timeout;
-
-int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *);
-int rxrpc_client_sendmsg(struct rxrpc_sock *, struct rxrpc_transport *,
- struct msghdr *, size_t);
-int rxrpc_server_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
+extern void rxrpc_process_local_events(struct rxrpc_local *);
/*
- * ar-peer.c
+ * local_object.c
*/
-struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *, gfp_t);
-void rxrpc_put_peer(struct rxrpc_peer *);
-struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *, __be32, __be16);
-void __exit rxrpc_destroy_all_peers(void);
+struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *);
+void __rxrpc_put_local(struct rxrpc_local *);
+void __exit rxrpc_destroy_all_locals(void);
-/*
- * ar-proc.c
- */
-extern const char *const rxrpc_call_states[];
-extern const struct file_operations rxrpc_call_seq_fops;
-extern const struct file_operations rxrpc_connection_seq_fops;
+static inline void rxrpc_get_local(struct rxrpc_local *local)
+{
+ atomic_inc(&local->usage);
+}
+
+static inline
+struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local)
+{
+ return atomic_inc_not_zero(&local->usage) ? local : NULL;
+}
+
+static inline void rxrpc_put_local(struct rxrpc_local *local)
+{
+ if (local && atomic_dec_and_test(&local->usage))
+ __rxrpc_put_local(local);
+}
+
+static inline void rxrpc_queue_local(struct rxrpc_local *local)
+{
+ rxrpc_queue_work(&local->processor);
+}
/*
- * ar-recvmsg.c
+ * misc.c
*/
-void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *);
-int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
+extern unsigned int rxrpc_max_backlog __read_mostly;
+extern unsigned int rxrpc_requested_ack_delay;
+extern unsigned int rxrpc_soft_ack_delay;
+extern unsigned int rxrpc_idle_ack_delay;
+extern unsigned int rxrpc_rx_window_size;
+extern unsigned int rxrpc_rx_mtu;
+extern unsigned int rxrpc_rx_jumbo_max;
+
+extern const char *const rxrpc_pkts[];
+extern const s8 rxrpc_ack_priority[];
+
+extern const char *rxrpc_acks(u8 reason);
/*
- * ar-security.c
+ * output.c
*/
-int __init rxrpc_init_security(void);
-void rxrpc_exit_security(void);
-int rxrpc_init_client_conn_security(struct rxrpc_connection *);
-int rxrpc_init_server_conn_security(struct rxrpc_connection *);
+extern unsigned int rxrpc_resend_timeout;
+
+int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *);
+int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
/*
- * ar-skbuff.c
+ * peer_event.c
*/
-void rxrpc_packet_destructor(struct sk_buff *);
+void rxrpc_error_report(struct sock *);
+void rxrpc_peer_error_distributor(struct work_struct *);
/*
- * ar-transport.c
+ * peer_object.c
*/
-extern unsigned int rxrpc_transport_expiry;
+struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *,
+ const struct sockaddr_rxrpc *);
+struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
+ struct sockaddr_rxrpc *, gfp_t);
+struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
-struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *,
- struct rxrpc_peer *, gfp_t);
-void rxrpc_put_transport(struct rxrpc_transport *);
-void __exit rxrpc_destroy_all_transports(void);
-struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *,
- struct rxrpc_peer *);
+static inline void rxrpc_get_peer(struct rxrpc_peer *peer)
+{
+ atomic_inc(&peer->usage);
+}
+
+static inline
+struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
+{
+ return atomic_inc_not_zero(&peer->usage) ? peer : NULL;
+}
+
+extern void __rxrpc_put_peer(struct rxrpc_peer *peer);
+static inline void rxrpc_put_peer(struct rxrpc_peer *peer)
+{
+ if (peer && atomic_dec_and_test(&peer->usage))
+ __rxrpc_put_peer(peer);
+}
/*
- * insecure.c
+ * proc.c
*/
-extern const struct rxrpc_security rxrpc_no_security;
+extern const char *const rxrpc_call_states[];
+extern const struct file_operations rxrpc_call_seq_fops;
+extern const struct file_operations rxrpc_connection_seq_fops;
/*
- * misc.c
+ * recvmsg.c
*/
-extern unsigned int rxrpc_requested_ack_delay;
-extern unsigned int rxrpc_soft_ack_delay;
-extern unsigned int rxrpc_idle_ack_delay;
-extern unsigned int rxrpc_rx_window_size;
-extern unsigned int rxrpc_rx_mtu;
-extern unsigned int rxrpc_rx_jumbo_max;
-
-extern const char *const rxrpc_pkts[];
-extern const s8 rxrpc_ack_priority[];
-
-extern const char *rxrpc_acks(u8 reason);
+void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *);
+int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
/*
* rxkad.c
@@ -659,6 +736,19 @@ extern const struct rxrpc_security rxkad;
#endif
/*
+ * security.c
+ */
+int __init rxrpc_init_security(void);
+void rxrpc_exit_security(void);
+int rxrpc_init_client_conn_security(struct rxrpc_connection *);
+int rxrpc_init_server_conn_security(struct rxrpc_connection *);
+
+/*
+ * skbuff.c
+ */
+void rxrpc_packet_destructor(struct sk_buff *);
+
+/*
* sysctl.c
*/
#ifdef CONFIG_SYSCTL
@@ -670,6 +760,11 @@ static inline void rxrpc_sysctl_exit(void) {}
#endif
/*
+ * utils.c
+ */
+int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
+
+/*
* debug tracing
*/
extern unsigned int rxrpc_debug;
@@ -744,21 +839,18 @@ do { \
#define ASSERT(X) \
do { \
if (unlikely(!(X))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "RxRPC: Assertion failed\n"); \
+ pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
#define ASSERTCMP(X, OP, Y) \
do { \
- if (unlikely(!((X) OP (Y)))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "RxRPC: Assertion failed\n"); \
- printk(KERN_ERR "%lu " #OP " %lu is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
+ unsigned long _x = (unsigned long)(X); \
+ unsigned long _y = (unsigned long)(Y); \
+ if (unlikely(!(_x OP _y))) { \
+ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
+ _x, _x, #OP, _y, _y); \
BUG(); \
} \
} while (0)
@@ -766,21 +858,18 @@ do { \
#define ASSERTIF(C, X) \
do { \
if (unlikely((C) && !(X))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "RxRPC: Assertion failed\n"); \
+ pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
- if (unlikely((C) && !((X) OP (Y)))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "RxRPC: Assertion failed\n"); \
- printk(KERN_ERR "%lu " #OP " %lu is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
+ unsigned long _x = (unsigned long)(X); \
+ unsigned long _y = (unsigned long)(Y); \
+ if (unlikely((C) && !(_x OP _y))) { \
+ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
+ _x, _x, #OP, _y, _y); \
BUG(); \
} \
} while (0)
@@ -844,15 +933,6 @@ static inline void rxrpc_purge_queue(struct sk_buff_head *list)
rxrpc_free_skb(skb);
}
-static inline void __rxrpc_get_local(struct rxrpc_local *local, const char *f)
-{
- CHECK_SLAB_OKAY(&local->usage);
- if (atomic_inc_return(&local->usage) == 1)
- printk("resurrected (%s)\n", f);
-}
-
-#define rxrpc_get_local(LOCAL) __rxrpc_get_local((LOCAL), __func__)
-
#define rxrpc_get_call(CALL) \
do { \
CHECK_SLAB_OKAY(&(CALL)->usage); \
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
new file mode 100644
index 000000000..9bae21e66
--- /dev/null
+++ b/net/rxrpc/call_accept.c
@@ -0,0 +1,477 @@
+/* incoming call handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+/*
+ * generate a connection-level abort
+ */
+static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
+ struct rxrpc_wire_header *whdr)
+{
+ struct msghdr msg;
+ struct kvec iov[1];
+ size_t len;
+ int ret;
+
+ _enter("%d,,", local->debug_id);
+
+ whdr->type = RXRPC_PACKET_TYPE_BUSY;
+ whdr->serial = htonl(1);
+
+ msg.msg_name = &srx->transport.sin;
+ msg.msg_namelen = sizeof(srx->transport.sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ iov[0].iov_base = whdr;
+ iov[0].iov_len = sizeof(*whdr);
+
+ len = iov[0].iov_len;
+
+ _proto("Tx BUSY %%1");
+
+ ret = kernel_sendmsg(local->socket, &msg, iov, 1, len);
+ if (ret < 0) {
+ _leave(" = -EAGAIN [sendmsg failed: %d]", ret);
+ return -EAGAIN;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * accept an incoming call that needs peer, transport and/or connection setting
+ * up
+ */
+static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
+ struct rxrpc_sock *rx,
+ struct sk_buff *skb,
+ struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_connection *conn;
+ struct rxrpc_skb_priv *sp, *nsp;
+ struct rxrpc_call *call;
+ struct sk_buff *notification;
+ int ret;
+
+ _enter("");
+
+ sp = rxrpc_skb(skb);
+
+ /* get a notification message to send to the server app */
+ notification = alloc_skb(0, GFP_NOFS);
+ if (!notification) {
+ _debug("no memory");
+ ret = -ENOMEM;
+ goto error_nofree;
+ }
+ rxrpc_new_skb(notification);
+ notification->mark = RXRPC_SKB_MARK_NEW_CALL;
+
+ conn = rxrpc_incoming_connection(local, srx, skb);
+ if (IS_ERR(conn)) {
+ _debug("no conn");
+ ret = PTR_ERR(conn);
+ goto error;
+ }
+
+ call = rxrpc_incoming_call(rx, conn, skb);
+ rxrpc_put_connection(conn);
+ if (IS_ERR(call)) {
+ _debug("no call");
+ ret = PTR_ERR(call);
+ goto error;
+ }
+
+ /* attach the call to the socket */
+ read_lock_bh(&local->services_lock);
+ if (rx->sk.sk_state == RXRPC_CLOSE)
+ goto invalid_service;
+
+ write_lock(&rx->call_lock);
+ if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) {
+ rxrpc_get_call(call);
+
+ spin_lock(&call->conn->state_lock);
+ if (sp->hdr.securityIndex > 0 &&
+ call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) {
+ _debug("await conn sec");
+ list_add_tail(&call->accept_link, &rx->secureq);
+ call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING;
+ set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events);
+ rxrpc_queue_conn(call->conn);
+ } else {
+ _debug("conn ready");
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ list_add_tail(&call->accept_link, &rx->acceptq);
+ rxrpc_get_call(call);
+ atomic_inc(&call->skb_count);
+ nsp = rxrpc_skb(notification);
+ nsp->call = call;
+
+ ASSERTCMP(atomic_read(&call->usage), >=, 3);
+
+ _debug("notify");
+ spin_lock(&call->lock);
+ ret = rxrpc_queue_rcv_skb(call, notification, true,
+ false);
+ spin_unlock(&call->lock);
+ notification = NULL;
+ BUG_ON(ret < 0);
+ }
+ spin_unlock(&call->conn->state_lock);
+
+ _debug("queued");
+ }
+ write_unlock(&rx->call_lock);
+
+ _debug("process");
+ rxrpc_fast_process_packet(call, skb);
+
+ _debug("done");
+ read_unlock_bh(&local->services_lock);
+ rxrpc_free_skb(notification);
+ rxrpc_put_call(call);
+ _leave(" = 0");
+ return 0;
+
+invalid_service:
+ _debug("invalid");
+ read_unlock_bh(&local->services_lock);
+
+ read_lock_bh(&call->state_lock);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
+ rxrpc_get_call(call);
+ rxrpc_queue_call(call);
+ }
+ read_unlock_bh(&call->state_lock);
+ rxrpc_put_call(call);
+ ret = -ECONNREFUSED;
+error:
+ rxrpc_free_skb(notification);
+error_nofree:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * accept incoming calls that need peer, transport and/or connection setting up
+ * - the packets we get are all incoming client DATA packets that have seq == 1
+ */
+void rxrpc_accept_incoming_calls(struct rxrpc_local *local)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_sock *rx;
+ struct rxrpc_wire_header whdr;
+ struct sk_buff *skb;
+ int ret;
+
+ _enter("%d", local->debug_id);
+
+ skb = skb_dequeue(&local->accept_queue);
+ if (!skb) {
+ _leave("\n");
+ return;
+ }
+
+ _net("incoming call skb %p", skb);
+
+ sp = rxrpc_skb(skb);
+
+ /* Set up a response packet header in case we need it */
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.seq = htonl(sp->hdr.seq);
+ whdr.serial = 0;
+ whdr.flags = 0;
+ whdr.type = 0;
+ whdr.userStatus = 0;
+ whdr.securityIndex = sp->hdr.securityIndex;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(sp->hdr.serviceId);
+
+ if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
+ goto drop;
+
+ /* get the socket providing the service */
+ read_lock_bh(&local->services_lock);
+ list_for_each_entry(rx, &local->services, listen_link) {
+ if (rx->srx.srx_service == sp->hdr.serviceId &&
+ rx->sk.sk_state != RXRPC_CLOSE)
+ goto found_service;
+ }
+ read_unlock_bh(&local->services_lock);
+ goto invalid_service;
+
+found_service:
+ _debug("found service %hd", rx->srx.srx_service);
+ if (sk_acceptq_is_full(&rx->sk))
+ goto backlog_full;
+ sk_acceptq_added(&rx->sk);
+ sock_hold(&rx->sk);
+ read_unlock_bh(&local->services_lock);
+
+ ret = rxrpc_accept_incoming_call(local, rx, skb, &srx);
+ if (ret < 0)
+ sk_acceptq_removed(&rx->sk);
+ sock_put(&rx->sk);
+ switch (ret) {
+ case -ECONNRESET: /* old calls are ignored */
+ case -ECONNABORTED: /* aborted calls are reaborted or ignored */
+ case 0:
+ return;
+ case -ECONNREFUSED:
+ goto invalid_service;
+ case -EBUSY:
+ goto busy;
+ case -EKEYREJECTED:
+ goto security_mismatch;
+ default:
+ BUG();
+ }
+
+backlog_full:
+ read_unlock_bh(&local->services_lock);
+busy:
+ rxrpc_busy(local, &srx, &whdr);
+ rxrpc_free_skb(skb);
+ return;
+
+drop:
+ rxrpc_free_skb(skb);
+ return;
+
+invalid_service:
+ skb->priority = RX_INVALID_OPERATION;
+ rxrpc_reject_packet(local, skb);
+ return;
+
+ /* can't change connection security type mid-flow */
+security_mismatch:
+ skb->priority = RX_PROTOCOL_ERROR;
+ rxrpc_reject_packet(local, skb);
+ return;
+}
+
+/*
+ * handle acceptance of a call by userspace
+ * - assign the user call ID to the call at the front of the queue
+ */
+struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
+ unsigned long user_call_ID)
+{
+ struct rxrpc_call *call;
+ struct rb_node *parent, **pp;
+ int ret;
+
+ _enter(",%lx", user_call_ID);
+
+ ASSERT(!irqs_disabled());
+
+ write_lock(&rx->call_lock);
+
+ ret = -ENODATA;
+ if (list_empty(&rx->acceptq))
+ goto out;
+
+ /* check the user ID isn't already in use */
+ ret = -EBADSLT;
+ pp = &rx->calls.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ call = rb_entry(parent, struct rxrpc_call, sock_node);
+
+ if (user_call_ID < call->user_call_ID)
+ pp = &(*pp)->rb_left;
+ else if (user_call_ID > call->user_call_ID)
+ pp = &(*pp)->rb_right;
+ else
+ goto out;
+ }
+
+ /* dequeue the first call and check it's still valid */
+ call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+ list_del_init(&call->accept_link);
+ sk_acceptq_removed(&rx->sk);
+
+ write_lock_bh(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_SERVER_ACCEPTING:
+ call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
+ break;
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ ret = -ECONNABORTED;
+ goto out_release;
+ case RXRPC_CALL_NETWORK_ERROR:
+ ret = call->conn->error;
+ goto out_release;
+ case RXRPC_CALL_DEAD:
+ ret = -ETIME;
+ goto out_discard;
+ default:
+ BUG();
+ }
+
+ /* formalise the acceptance */
+ call->user_call_ID = user_call_ID;
+ rb_link_node(&call->sock_node, parent, pp);
+ rb_insert_color(&call->sock_node, &rx->calls);
+ if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags))
+ BUG();
+ if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events))
+ BUG();
+ rxrpc_queue_call(call);
+
+ rxrpc_get_call(call);
+ write_unlock_bh(&call->state_lock);
+ write_unlock(&rx->call_lock);
+ _leave(" = %p{%d}", call, call->debug_id);
+ return call;
+
+ /* if the call is already dying or dead, then we leave the socket's ref
+ * on it to be released by rxrpc_dead_call_expired() as induced by
+ * rxrpc_release_call() */
+out_release:
+ _debug("release %p", call);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
+ rxrpc_queue_call(call);
+out_discard:
+ write_unlock_bh(&call->state_lock);
+ _debug("discard %p", call);
+out:
+ write_unlock(&rx->call_lock);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Handle rejection of a call by userspace
+ * - reject the call at the front of the queue
+ */
+int rxrpc_reject_call(struct rxrpc_sock *rx)
+{
+ struct rxrpc_call *call;
+ int ret;
+
+ _enter("");
+
+ ASSERT(!irqs_disabled());
+
+ write_lock(&rx->call_lock);
+
+ ret = -ENODATA;
+ if (list_empty(&rx->acceptq))
+ goto out;
+
+ /* dequeue the first call and check it's still valid */
+ call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+ list_del_init(&call->accept_link);
+ sk_acceptq_removed(&rx->sk);
+
+ write_lock_bh(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_SERVER_ACCEPTING:
+ call->state = RXRPC_CALL_SERVER_BUSY;
+ if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events))
+ rxrpc_queue_call(call);
+ ret = 0;
+ goto out_release;
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ ret = -ECONNABORTED;
+ goto out_release;
+ case RXRPC_CALL_NETWORK_ERROR:
+ ret = call->conn->error;
+ goto out_release;
+ case RXRPC_CALL_DEAD:
+ ret = -ETIME;
+ goto out_discard;
+ default:
+ BUG();
+ }
+
+ /* if the call is already dying or dead, then we leave the socket's ref
+ * on it to be released by rxrpc_dead_call_expired() as induced by
+ * rxrpc_release_call() */
+out_release:
+ _debug("release %p", call);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
+ rxrpc_queue_call(call);
+out_discard:
+ write_unlock_bh(&call->state_lock);
+ _debug("discard %p", call);
+out:
+ write_unlock(&rx->call_lock);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call
+ * @sock: The socket on which the impending call is waiting
+ * @user_call_ID: The tag to attach to the call
+ *
+ * Allow a kernel service to accept an incoming call, assuming the incoming
+ * call is still valid.
+ */
+struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock,
+ unsigned long user_call_ID)
+{
+ struct rxrpc_call *call;
+
+ _enter(",%lx", user_call_ID);
+ call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID);
+ _leave(" = %p", call);
+ return call;
+}
+EXPORT_SYMBOL(rxrpc_kernel_accept_call);
+
+/**
+ * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call
+ * @sock: The socket on which the impending call is waiting
+ *
+ * Allow a kernel service to reject an incoming call with a BUSY message,
+ * assuming the incoming call is still valid.
+ */
+int rxrpc_kernel_reject_call(struct socket *sock)
+{
+ int ret;
+
+ _enter("");
+ ret = rxrpc_reject_call(rxrpc_sk(sock->sk));
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(rxrpc_kernel_reject_call);
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
new file mode 100644
index 000000000..e60cf65c2
--- /dev/null
+++ b/net/rxrpc/call_event.c
@@ -0,0 +1,1302 @@
+/* Management of Tx window, Tx resend, ACKs and out-of-sequence reception
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/circ_buf.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * propose an ACK be sent
+ */
+void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+ u32 serial, bool immediate)
+{
+ unsigned long expiry;
+ s8 prior = rxrpc_ack_priority[ack_reason];
+
+ ASSERTCMP(prior, >, 0);
+
+ _enter("{%d},%s,%%%x,%u",
+ call->debug_id, rxrpc_acks(ack_reason), serial, immediate);
+
+ if (prior < rxrpc_ack_priority[call->ackr_reason]) {
+ if (immediate)
+ goto cancel_timer;
+ return;
+ }
+
+ /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
+ * numbers */
+ if (prior == rxrpc_ack_priority[call->ackr_reason]) {
+ if (prior <= 4)
+ call->ackr_serial = serial;
+ if (immediate)
+ goto cancel_timer;
+ return;
+ }
+
+ call->ackr_reason = ack_reason;
+ call->ackr_serial = serial;
+
+ switch (ack_reason) {
+ case RXRPC_ACK_DELAY:
+ _debug("run delay timer");
+ expiry = rxrpc_soft_ack_delay;
+ goto run_timer;
+
+ case RXRPC_ACK_IDLE:
+ if (!immediate) {
+ _debug("run defer timer");
+ expiry = rxrpc_idle_ack_delay;
+ goto run_timer;
+ }
+ goto cancel_timer;
+
+ case RXRPC_ACK_REQUESTED:
+ expiry = rxrpc_requested_ack_delay;
+ if (!expiry)
+ goto cancel_timer;
+ if (!immediate || serial == 1) {
+ _debug("run defer timer");
+ goto run_timer;
+ }
+
+ default:
+ _debug("immediate ACK");
+ goto cancel_timer;
+ }
+
+run_timer:
+ expiry += jiffies;
+ if (!timer_pending(&call->ack_timer) ||
+ time_after(call->ack_timer.expires, expiry))
+ mod_timer(&call->ack_timer, expiry);
+ return;
+
+cancel_timer:
+ _debug("cancel timer %%%u", serial);
+ try_to_del_timer_sync(&call->ack_timer);
+ read_lock_bh(&call->state_lock);
+ if (call->state <= RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * propose an ACK be sent, locking the call structure
+ */
+void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+ u32 serial, bool immediate)
+{
+ s8 prior = rxrpc_ack_priority[ack_reason];
+
+ if (prior > rxrpc_ack_priority[call->ackr_reason]) {
+ spin_lock_bh(&call->lock);
+ __rxrpc_propose_ACK(call, ack_reason, serial, immediate);
+ spin_unlock_bh(&call->lock);
+ }
+}
+
+/*
+ * set the resend timer
+ */
+static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
+ unsigned long resend_at)
+{
+ read_lock_bh(&call->state_lock);
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ resend = 0;
+
+ if (resend & 1) {
+ _debug("SET RESEND");
+ set_bit(RXRPC_CALL_EV_RESEND, &call->events);
+ }
+
+ if (resend & 2) {
+ _debug("MODIFY RESEND TIMER");
+ set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ mod_timer(&call->resend_timer, resend_at);
+ } else {
+ _debug("KILL RESEND TIMER");
+ del_timer_sync(&call->resend_timer);
+ clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ }
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * resend packets
+ */
+static void rxrpc_resend(struct rxrpc_call *call)
+{
+ struct rxrpc_wire_header *whdr;
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *txb;
+ unsigned long *p_txb, resend_at;
+ bool stop;
+ int loop;
+ u8 resend;
+
+ _enter("{%d,%d,%d,%d},",
+ call->acks_hard, call->acks_unacked,
+ atomic_read(&call->sequence),
+ CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
+
+ stop = false;
+ resend = 0;
+ resend_at = 0;
+
+ for (loop = call->acks_tail;
+ loop != call->acks_head || stop;
+ loop = (loop + 1) & (call->acks_winsz - 1)
+ ) {
+ p_txb = call->acks_window + loop;
+ smp_read_barrier_depends();
+ if (*p_txb & 1)
+ continue;
+
+ txb = (struct sk_buff *) *p_txb;
+ sp = rxrpc_skb(txb);
+
+ if (sp->need_resend) {
+ sp->need_resend = false;
+
+ /* each Tx packet has a new serial number */
+ sp->hdr.serial = atomic_inc_return(&call->conn->serial);
+
+ whdr = (struct rxrpc_wire_header *)txb->head;
+ whdr->serial = htonl(sp->hdr.serial);
+
+ _proto("Tx DATA %%%u { #%d }",
+ sp->hdr.serial, sp->hdr.seq);
+ if (rxrpc_send_data_packet(call->conn, txb) < 0) {
+ stop = true;
+ sp->resend_at = jiffies + 3;
+ } else {
+ sp->resend_at =
+ jiffies + rxrpc_resend_timeout;
+ }
+ }
+
+ if (time_after_eq(jiffies + 1, sp->resend_at)) {
+ sp->need_resend = true;
+ resend |= 1;
+ } else if (resend & 2) {
+ if (time_before(sp->resend_at, resend_at))
+ resend_at = sp->resend_at;
+ } else {
+ resend_at = sp->resend_at;
+ resend |= 2;
+ }
+ }
+
+ rxrpc_set_resend(call, resend, resend_at);
+ _leave("");
+}
+
+/*
+ * handle resend timer expiry
+ */
+static void rxrpc_resend_timer(struct rxrpc_call *call)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *txb;
+ unsigned long *p_txb, resend_at;
+ int loop;
+ u8 resend;
+
+ _enter("%d,%d,%d",
+ call->acks_tail, call->acks_unacked, call->acks_head);
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ resend = 0;
+ resend_at = 0;
+
+ for (loop = call->acks_unacked;
+ loop != call->acks_head;
+ loop = (loop + 1) & (call->acks_winsz - 1)
+ ) {
+ p_txb = call->acks_window + loop;
+ smp_read_barrier_depends();
+ txb = (struct sk_buff *) (*p_txb & ~1);
+ sp = rxrpc_skb(txb);
+
+ ASSERT(!(*p_txb & 1));
+
+ if (sp->need_resend) {
+ ;
+ } else if (time_after_eq(jiffies + 1, sp->resend_at)) {
+ sp->need_resend = true;
+ resend |= 1;
+ } else if (resend & 2) {
+ if (time_before(sp->resend_at, resend_at))
+ resend_at = sp->resend_at;
+ } else {
+ resend_at = sp->resend_at;
+ resend |= 2;
+ }
+ }
+
+ rxrpc_set_resend(call, resend, resend_at);
+ _leave("");
+}
+
+/*
+ * process soft ACKs of our transmitted packets
+ * - these indicate packets the peer has or has not received, but hasn't yet
+ * given to the consumer, and so can still be discarded and re-requested
+ */
+static int rxrpc_process_soft_ACKs(struct rxrpc_call *call,
+ struct rxrpc_ackpacket *ack,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *txb;
+ unsigned long *p_txb, resend_at;
+ int loop;
+ u8 sacks[RXRPC_MAXACKS], resend;
+
+ _enter("{%d,%d},{%d},",
+ call->acks_hard,
+ CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz),
+ ack->nAcks);
+
+ if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0)
+ goto protocol_error;
+
+ resend = 0;
+ resend_at = 0;
+ for (loop = 0; loop < ack->nAcks; loop++) {
+ p_txb = call->acks_window;
+ p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1);
+ smp_read_barrier_depends();
+ txb = (struct sk_buff *) (*p_txb & ~1);
+ sp = rxrpc_skb(txb);
+
+ switch (sacks[loop]) {
+ case RXRPC_ACK_TYPE_ACK:
+ sp->need_resend = false;
+ *p_txb |= 1;
+ break;
+ case RXRPC_ACK_TYPE_NACK:
+ sp->need_resend = true;
+ *p_txb &= ~1;
+ resend = 1;
+ break;
+ default:
+ _debug("Unsupported ACK type %d", sacks[loop]);
+ goto protocol_error;
+ }
+ }
+
+ smp_mb();
+ call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1);
+
+ /* anything not explicitly ACK'd is implicitly NACK'd, but may just not
+ * have been received or processed yet by the far end */
+ for (loop = call->acks_unacked;
+ loop != call->acks_head;
+ loop = (loop + 1) & (call->acks_winsz - 1)
+ ) {
+ p_txb = call->acks_window + loop;
+ smp_read_barrier_depends();
+ txb = (struct sk_buff *) (*p_txb & ~1);
+ sp = rxrpc_skb(txb);
+
+ if (*p_txb & 1) {
+ /* packet must have been discarded */
+ sp->need_resend = true;
+ *p_txb &= ~1;
+ resend |= 1;
+ } else if (sp->need_resend) {
+ ;
+ } else if (time_after_eq(jiffies + 1, sp->resend_at)) {
+ sp->need_resend = true;
+ resend |= 1;
+ } else if (resend & 2) {
+ if (time_before(sp->resend_at, resend_at))
+ resend_at = sp->resend_at;
+ } else {
+ resend_at = sp->resend_at;
+ resend |= 2;
+ }
+ }
+
+ rxrpc_set_resend(call, resend, resend_at);
+ _leave(" = 0");
+ return 0;
+
+protocol_error:
+ _leave(" = -EPROTO");
+ return -EPROTO;
+}
+
+/*
+ * discard hard-ACK'd packets from the Tx window
+ */
+static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
+{
+ unsigned long _skb;
+ int tail = call->acks_tail, old_tail;
+ int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
+
+ _enter("{%u,%u},%u", call->acks_hard, win, hard);
+
+ ASSERTCMP(hard - call->acks_hard, <=, win);
+
+ while (call->acks_hard < hard) {
+ smp_read_barrier_depends();
+ _skb = call->acks_window[tail] & ~1;
+ rxrpc_free_skb((struct sk_buff *) _skb);
+ old_tail = tail;
+ tail = (tail + 1) & (call->acks_winsz - 1);
+ call->acks_tail = tail;
+ if (call->acks_unacked == old_tail)
+ call->acks_unacked = tail;
+ call->acks_hard++;
+ }
+
+ wake_up(&call->tx_waitq);
+}
+
+/*
+ * clear the Tx window in the event of a failure
+ */
+static void rxrpc_clear_tx_window(struct rxrpc_call *call)
+{
+ rxrpc_rotate_tx_window(call, atomic_read(&call->sequence));
+}
+
+/*
+ * drain the out of sequence received packet queue into the packet Rx queue
+ */
+static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ bool terminal;
+ int ret;
+
+ _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos);
+
+ spin_lock_bh(&call->lock);
+
+ ret = -ECONNRESET;
+ if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
+ goto socket_unavailable;
+
+ skb = skb_dequeue(&call->rx_oos_queue);
+ if (skb) {
+ sp = rxrpc_skb(skb);
+
+ _debug("drain OOS packet %d [%d]",
+ sp->hdr.seq, call->rx_first_oos);
+
+ if (sp->hdr.seq != call->rx_first_oos) {
+ skb_queue_head(&call->rx_oos_queue, skb);
+ call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
+ _debug("requeue %p {%u}", skb, call->rx_first_oos);
+ } else {
+ skb->mark = RXRPC_SKB_MARK_DATA;
+ terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) &&
+ !(sp->hdr.flags & RXRPC_CLIENT_INITIATED));
+ ret = rxrpc_queue_rcv_skb(call, skb, true, terminal);
+ BUG_ON(ret < 0);
+ _debug("drain #%u", call->rx_data_post);
+ call->rx_data_post++;
+
+ /* find out what the next packet is */
+ skb = skb_peek(&call->rx_oos_queue);
+ if (skb)
+ call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
+ else
+ call->rx_first_oos = 0;
+ _debug("peek %p {%u}", skb, call->rx_first_oos);
+ }
+ }
+
+ ret = 0;
+socket_unavailable:
+ spin_unlock_bh(&call->lock);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * insert an out of sequence packet into the buffer
+ */
+static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp, *psp;
+ struct sk_buff *p;
+ u32 seq;
+
+ sp = rxrpc_skb(skb);
+ seq = sp->hdr.seq;
+ _enter(",,{%u}", seq);
+
+ skb->destructor = rxrpc_packet_destructor;
+ ASSERTCMP(sp->call, ==, NULL);
+ sp->call = call;
+ rxrpc_get_call(call);
+ atomic_inc(&call->skb_count);
+
+ /* insert into the buffer in sequence order */
+ spin_lock_bh(&call->lock);
+
+ skb_queue_walk(&call->rx_oos_queue, p) {
+ psp = rxrpc_skb(p);
+ if (psp->hdr.seq > seq) {
+ _debug("insert oos #%u before #%u", seq, psp->hdr.seq);
+ skb_insert(p, skb, &call->rx_oos_queue);
+ goto inserted;
+ }
+ }
+
+ _debug("append oos #%u", seq);
+ skb_queue_tail(&call->rx_oos_queue, skb);
+inserted:
+
+ /* we might now have a new front to the queue */
+ if (call->rx_first_oos == 0 || seq < call->rx_first_oos)
+ call->rx_first_oos = seq;
+
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ call->rx_data_post == call->rx_first_oos) {
+ _debug("drain rx oos now");
+ set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
+ }
+ read_unlock(&call->state_lock);
+
+ spin_unlock_bh(&call->lock);
+ _leave(" [stored #%u]", call->rx_first_oos);
+}
+
+/*
+ * clear the Tx window on final ACK reception
+ */
+static void rxrpc_zap_tx_window(struct rxrpc_call *call)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ unsigned long _skb, *acks_window;
+ u8 winsz = call->acks_winsz;
+ int tail;
+
+ acks_window = call->acks_window;
+ call->acks_window = NULL;
+
+ while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) {
+ tail = call->acks_tail;
+ smp_read_barrier_depends();
+ _skb = acks_window[tail] & ~1;
+ smp_mb();
+ call->acks_tail = (call->acks_tail + 1) & (winsz - 1);
+
+ skb = (struct sk_buff *) _skb;
+ sp = rxrpc_skb(skb);
+ _debug("+++ clear Tx %u", sp->hdr.seq);
+ rxrpc_free_skb(skb);
+ }
+
+ kfree(acks_window);
+}
+
+/*
+ * process the extra information that may be appended to an ACK packet
+ */
+static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int latest, int nAcks)
+{
+ struct rxrpc_ackinfo ackinfo;
+ struct rxrpc_peer *peer;
+ unsigned int mtu;
+
+ if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) {
+ _leave(" [no ackinfo]");
+ return;
+ }
+
+ _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
+ latest,
+ ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU),
+ ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max));
+
+ mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU));
+
+ peer = call->conn->params.peer;
+ if (mtu < peer->maxdata) {
+ spin_lock_bh(&peer->lock);
+ peer->maxdata = mtu;
+ peer->mtu = mtu + peer->hdrsize;
+ spin_unlock_bh(&peer->lock);
+ _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
+ }
+}
+
+/*
+ * process packets in the reception queue
+ */
+static int rxrpc_process_rx_queue(struct rxrpc_call *call,
+ u32 *_abort_code)
+{
+ struct rxrpc_ackpacket ack;
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ bool post_ACK;
+ int latest;
+ u32 hard, tx;
+
+ _enter("");
+
+process_further:
+ skb = skb_dequeue(&call->rx_queue);
+ if (!skb)
+ return -EAGAIN;
+
+ _net("deferred skb %p", skb);
+
+ sp = rxrpc_skb(skb);
+
+ _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state);
+
+ post_ACK = false;
+
+ switch (sp->hdr.type) {
+ /* data packets that wind up here have been received out of
+ * order, need security processing or are jumbo packets */
+ case RXRPC_PACKET_TYPE_DATA:
+ _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
+
+ /* secured packets must be verified and possibly decrypted */
+ if (call->conn->security->verify_packet(call, skb,
+ _abort_code) < 0)
+ goto protocol_error;
+
+ rxrpc_insert_oos_packet(call, skb);
+ goto process_further;
+
+ /* partial ACK to process */
+ case RXRPC_PACKET_TYPE_ACK:
+ if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) {
+ _debug("extraction failure");
+ goto protocol_error;
+ }
+ if (!skb_pull(skb, sizeof(ack)))
+ BUG();
+
+ latest = sp->hdr.serial;
+ hard = ntohl(ack.firstPacket);
+ tx = atomic_read(&call->sequence);
+
+ _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
+ latest,
+ ntohs(ack.maxSkew),
+ hard,
+ ntohl(ack.previousPacket),
+ ntohl(ack.serial),
+ rxrpc_acks(ack.reason),
+ ack.nAcks);
+
+ rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks);
+
+ if (ack.reason == RXRPC_ACK_PING) {
+ _proto("Rx ACK %%%u PING Request", latest);
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
+ sp->hdr.serial, true);
+ }
+
+ /* discard any out-of-order or duplicate ACKs */
+ if (latest - call->acks_latest <= 0) {
+ _debug("discard ACK %d <= %d",
+ latest, call->acks_latest);
+ goto discard;
+ }
+ call->acks_latest = latest;
+
+ if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
+ call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY &&
+ call->state != RXRPC_CALL_SERVER_SEND_REPLY &&
+ call->state != RXRPC_CALL_SERVER_AWAIT_ACK)
+ goto discard;
+
+ _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state);
+
+ if (hard > 0) {
+ if (hard - 1 > tx) {
+ _debug("hard-ACK'd packet %d not transmitted"
+ " (%d top)",
+ hard - 1, tx);
+ goto protocol_error;
+ }
+
+ if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY ||
+ call->state == RXRPC_CALL_SERVER_AWAIT_ACK) &&
+ hard > tx) {
+ call->acks_hard = tx;
+ goto all_acked;
+ }
+
+ smp_rmb();
+ rxrpc_rotate_tx_window(call, hard - 1);
+ }
+
+ if (ack.nAcks > 0) {
+ if (hard - 1 + ack.nAcks > tx) {
+ _debug("soft-ACK'd packet %d+%d not"
+ " transmitted (%d top)",
+ hard - 1, ack.nAcks, tx);
+ goto protocol_error;
+ }
+
+ if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0)
+ goto protocol_error;
+ }
+ goto discard;
+
+ /* complete ACK to process */
+ case RXRPC_PACKET_TYPE_ACKALL:
+ goto all_acked;
+
+ /* abort and busy are handled elsewhere */
+ case RXRPC_PACKET_TYPE_BUSY:
+ case RXRPC_PACKET_TYPE_ABORT:
+ BUG();
+
+ /* connection level events - also handled elsewhere */
+ case RXRPC_PACKET_TYPE_CHALLENGE:
+ case RXRPC_PACKET_TYPE_RESPONSE:
+ case RXRPC_PACKET_TYPE_DEBUG:
+ BUG();
+ }
+
+ /* if we've had a hard ACK that covers all the packets we've sent, then
+ * that ends that phase of the operation */
+all_acked:
+ write_lock_bh(&call->state_lock);
+ _debug("ack all %d", call->state);
+
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+ break;
+ case RXRPC_CALL_SERVER_AWAIT_ACK:
+ _debug("srv complete");
+ call->state = RXRPC_CALL_COMPLETE;
+ post_ACK = true;
+ break;
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ goto protocol_error_unlock; /* can't occur yet */
+ default:
+ write_unlock_bh(&call->state_lock);
+ goto discard; /* assume packet left over from earlier phase */
+ }
+
+ write_unlock_bh(&call->state_lock);
+
+ /* if all the packets we sent are hard-ACK'd, then we can discard
+ * whatever we've got left */
+ _debug("clear Tx %d",
+ CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
+
+ del_timer_sync(&call->resend_timer);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
+
+ if (call->acks_window)
+ rxrpc_zap_tx_window(call);
+
+ if (post_ACK) {
+ /* post the final ACK message for userspace to pick up */
+ _debug("post ACK");
+ skb->mark = RXRPC_SKB_MARK_FINAL_ACK;
+ sp->call = call;
+ rxrpc_get_call(call);
+ atomic_inc(&call->skb_count);
+ spin_lock_bh(&call->lock);
+ if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0)
+ BUG();
+ spin_unlock_bh(&call->lock);
+ goto process_further;
+ }
+
+discard:
+ rxrpc_free_skb(skb);
+ goto process_further;
+
+protocol_error_unlock:
+ write_unlock_bh(&call->state_lock);
+protocol_error:
+ rxrpc_free_skb(skb);
+ _leave(" = -EPROTO");
+ return -EPROTO;
+}
+
+/*
+ * post a message to the socket Rx queue for recvmsg() to pick up
+ */
+static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error,
+ bool fatal)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ int ret;
+
+ _enter("{%d,%lx},%u,%u,%d",
+ call->debug_id, call->flags, mark, error, fatal);
+
+ /* remove timers and things for fatal messages */
+ if (fatal) {
+ del_timer_sync(&call->resend_timer);
+ del_timer_sync(&call->ack_timer);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ }
+
+ if (mark != RXRPC_SKB_MARK_NEW_CALL &&
+ !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ _leave("[no userid]");
+ return 0;
+ }
+
+ if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
+ skb = alloc_skb(0, GFP_NOFS);
+ if (!skb)
+ return -ENOMEM;
+
+ rxrpc_new_skb(skb);
+
+ skb->mark = mark;
+
+ sp = rxrpc_skb(skb);
+ memset(sp, 0, sizeof(*sp));
+ sp->error = error;
+ sp->call = call;
+ rxrpc_get_call(call);
+ atomic_inc(&call->skb_count);
+
+ spin_lock_bh(&call->lock);
+ ret = rxrpc_queue_rcv_skb(call, skb, true, fatal);
+ spin_unlock_bh(&call->lock);
+ BUG_ON(ret < 0);
+ }
+
+ return 0;
+}
+
+/*
+ * handle background processing of incoming call packets and ACK / abort
+ * generation
+ */
+void rxrpc_process_call(struct work_struct *work)
+{
+ struct rxrpc_call *call =
+ container_of(work, struct rxrpc_call, processor);
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_ackpacket ack;
+ struct rxrpc_ackinfo ackinfo;
+ struct msghdr msg;
+ struct kvec iov[5];
+ enum rxrpc_call_event genbit;
+ unsigned long bits;
+ __be32 data, pad;
+ size_t len;
+ int loop, nbit, ioc, ret, mtu;
+ u32 serial, abort_code = RX_PROTOCOL_ERROR;
+ u8 *acks = NULL;
+
+ //printk("\n--------------------\n");
+ _enter("{%d,%s,%lx} [%lu]",
+ call->debug_id, rxrpc_call_states[call->state], call->events,
+ (jiffies - call->creation_jif) / (HZ / 10));
+
+ if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) {
+ _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX");
+ return;
+ }
+
+ if (!call->conn)
+ goto skip_msg_init;
+
+ /* there's a good chance we're going to have to send a message, so set
+ * one up in advance */
+ msg.msg_name = &call->conn->params.peer->srx.transport;
+ msg.msg_namelen = call->conn->params.peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ whdr.epoch = htonl(call->conn->proto.epoch);
+ whdr.cid = htonl(call->cid);
+ whdr.callNumber = htonl(call->call_id);
+ whdr.seq = 0;
+ whdr.type = RXRPC_PACKET_TYPE_ACK;
+ whdr.flags = call->conn->out_clientflag;
+ whdr.userStatus = 0;
+ whdr.securityIndex = call->conn->security_ix;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(call->service_id);
+
+ memset(iov, 0, sizeof(iov));
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+skip_msg_init:
+
+ /* deal with events of a final nature */
+ if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) {
+ enum rxrpc_skb_mark mark;
+ int error;
+
+ clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
+ clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
+ clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
+
+ error = call->error_report;
+ if (error < RXRPC_LOCAL_ERROR_OFFSET) {
+ mark = RXRPC_SKB_MARK_NET_ERROR;
+ _debug("post net error %d", error);
+ } else {
+ mark = RXRPC_SKB_MARK_LOCAL_ERROR;
+ error -= RXRPC_LOCAL_ERROR_OFFSET;
+ _debug("post net local error %d", error);
+ }
+
+ if (rxrpc_post_message(call, mark, error, true) < 0)
+ goto no_mem;
+ clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
+ goto kill_ACKs;
+ }
+
+ if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) {
+ ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
+
+ clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
+ clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
+
+ _debug("post conn abort");
+
+ if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
+ call->conn->error, true) < 0)
+ goto no_mem;
+ clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
+ goto kill_ACKs;
+ }
+
+ if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) {
+ whdr.type = RXRPC_PACKET_TYPE_BUSY;
+ genbit = RXRPC_CALL_EV_REJECT_BUSY;
+ goto send_message;
+ }
+
+ if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
+ ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
+
+ if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
+ ECONNABORTED, true) < 0)
+ goto no_mem;
+ whdr.type = RXRPC_PACKET_TYPE_ABORT;
+ data = htonl(call->local_abort);
+ iov[1].iov_base = &data;
+ iov[1].iov_len = sizeof(data);
+ genbit = RXRPC_CALL_EV_ABORT;
+ goto send_message;
+ }
+
+ if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) {
+ genbit = RXRPC_CALL_EV_ACK_FINAL;
+
+ ack.bufferSpace = htons(8);
+ ack.maxSkew = 0;
+ ack.serial = 0;
+ ack.reason = RXRPC_ACK_IDLE;
+ ack.nAcks = 0;
+ call->ackr_reason = 0;
+
+ spin_lock_bh(&call->lock);
+ ack.serial = htonl(call->ackr_serial);
+ ack.previousPacket = htonl(call->ackr_prev_seq);
+ ack.firstPacket = htonl(call->rx_data_eaten + 1);
+ spin_unlock_bh(&call->lock);
+
+ pad = 0;
+
+ iov[1].iov_base = &ack;
+ iov[1].iov_len = sizeof(ack);
+ iov[2].iov_base = &pad;
+ iov[2].iov_len = 3;
+ iov[3].iov_base = &ackinfo;
+ iov[3].iov_len = sizeof(ackinfo);
+ goto send_ACK;
+ }
+
+ if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) |
+ (1 << RXRPC_CALL_EV_RCVD_ABORT))
+ ) {
+ u32 mark;
+
+ if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events))
+ mark = RXRPC_SKB_MARK_REMOTE_ABORT;
+ else
+ mark = RXRPC_SKB_MARK_BUSY;
+
+ _debug("post abort/busy");
+ rxrpc_clear_tx_window(call);
+ if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0)
+ goto no_mem;
+
+ clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
+ clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
+ goto kill_ACKs;
+ }
+
+ if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) {
+ _debug("do implicit ackall");
+ rxrpc_clear_tx_window(call);
+ }
+
+ if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) {
+ write_lock_bh(&call->state_lock);
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->local_abort = RX_CALL_TIMEOUT;
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ }
+ write_unlock_bh(&call->state_lock);
+
+ _debug("post timeout");
+ if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
+ ETIME, true) < 0)
+ goto no_mem;
+
+ clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
+ goto kill_ACKs;
+ }
+
+ /* deal with assorted inbound messages */
+ if (!skb_queue_empty(&call->rx_queue)) {
+ switch (rxrpc_process_rx_queue(call, &abort_code)) {
+ case 0:
+ case -EAGAIN:
+ break;
+ case -ENOMEM:
+ goto no_mem;
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EPROTO:
+ rxrpc_abort_call(call, abort_code);
+ goto kill_ACKs;
+ }
+ }
+
+ /* handle resending */
+ if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
+ rxrpc_resend_timer(call);
+ if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events))
+ rxrpc_resend(call);
+
+ /* consider sending an ordinary ACK */
+ if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
+ _debug("send ACK: window: %d - %d { %lx }",
+ call->rx_data_eaten, call->ackr_win_top,
+ call->ackr_window[0]);
+
+ if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST &&
+ call->ackr_reason != RXRPC_ACK_PING_RESPONSE) {
+ /* ACK by sending reply DATA packet in this state */
+ clear_bit(RXRPC_CALL_EV_ACK, &call->events);
+ goto maybe_reschedule;
+ }
+
+ genbit = RXRPC_CALL_EV_ACK;
+
+ acks = kzalloc(call->ackr_win_top - call->rx_data_eaten,
+ GFP_NOFS);
+ if (!acks)
+ goto no_mem;
+
+ //hdr.flags = RXRPC_SLOW_START_OK;
+ ack.bufferSpace = htons(8);
+ ack.maxSkew = 0;
+
+ spin_lock_bh(&call->lock);
+ ack.reason = call->ackr_reason;
+ ack.serial = htonl(call->ackr_serial);
+ ack.previousPacket = htonl(call->ackr_prev_seq);
+ ack.firstPacket = htonl(call->rx_data_eaten + 1);
+
+ ack.nAcks = 0;
+ for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
+ nbit = loop * BITS_PER_LONG;
+ for (bits = call->ackr_window[loop]; bits; bits >>= 1
+ ) {
+ _debug("- l=%d n=%d b=%lx", loop, nbit, bits);
+ if (bits & 1) {
+ acks[nbit] = RXRPC_ACK_TYPE_ACK;
+ ack.nAcks = nbit + 1;
+ }
+ nbit++;
+ }
+ }
+ call->ackr_reason = 0;
+ spin_unlock_bh(&call->lock);
+
+ pad = 0;
+
+ iov[1].iov_base = &ack;
+ iov[1].iov_len = sizeof(ack);
+ iov[2].iov_base = acks;
+ iov[2].iov_len = ack.nAcks;
+ iov[3].iov_base = &pad;
+ iov[3].iov_len = 3;
+ iov[4].iov_base = &ackinfo;
+ iov[4].iov_len = sizeof(ackinfo);
+
+ switch (ack.reason) {
+ case RXRPC_ACK_REQUESTED:
+ case RXRPC_ACK_DUPLICATE:
+ case RXRPC_ACK_OUT_OF_SEQUENCE:
+ case RXRPC_ACK_EXCEEDS_WINDOW:
+ case RXRPC_ACK_NOSPACE:
+ case RXRPC_ACK_PING:
+ case RXRPC_ACK_PING_RESPONSE:
+ goto send_ACK_with_skew;
+ case RXRPC_ACK_DELAY:
+ case RXRPC_ACK_IDLE:
+ goto send_ACK;
+ }
+ }
+
+ /* handle completion of security negotiations on an incoming
+ * connection */
+ if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) {
+ _debug("secured");
+ spin_lock_bh(&call->lock);
+
+ if (call->state == RXRPC_CALL_SERVER_SECURING) {
+ _debug("securing");
+ write_lock(&call->socket->call_lock);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
+ _debug("not released");
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ list_move_tail(&call->accept_link,
+ &call->socket->acceptq);
+ }
+ write_unlock(&call->socket->call_lock);
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE)
+ set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
+ read_unlock(&call->state_lock);
+ }
+
+ spin_unlock_bh(&call->lock);
+ if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events))
+ goto maybe_reschedule;
+ }
+
+ /* post a notification of an acceptable connection to the app */
+ if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) {
+ _debug("post accept");
+ if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL,
+ 0, false) < 0)
+ goto no_mem;
+ clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
+ goto maybe_reschedule;
+ }
+
+ /* handle incoming call acceptance */
+ if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) {
+ _debug("accepted");
+ ASSERTCMP(call->rx_data_post, ==, 0);
+ call->rx_data_post = 1;
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE)
+ set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
+ read_unlock_bh(&call->state_lock);
+ }
+
+ /* drain the out of sequence received packet queue into the packet Rx
+ * queue */
+ if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) {
+ while (call->rx_data_post == call->rx_first_oos)
+ if (rxrpc_drain_rx_oos_queue(call) < 0)
+ break;
+ goto maybe_reschedule;
+ }
+
+ if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
+ rxrpc_release_call(call);
+ clear_bit(RXRPC_CALL_EV_RELEASE, &call->events);
+ }
+
+ /* other events may have been raised since we started checking */
+ goto maybe_reschedule;
+
+send_ACK_with_skew:
+ ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) -
+ ntohl(ack.serial));
+send_ACK:
+ mtu = call->conn->params.peer->if_mtu;
+ mtu -= call->conn->params.peer->hdrsize;
+ ackinfo.maxMTU = htonl(mtu);
+ ackinfo.rwind = htonl(rxrpc_rx_window_size);
+
+ /* permit the peer to send us jumbo packets if it wants to */
+ ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
+ ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max);
+
+ serial = atomic_inc_return(&call->conn->serial);
+ whdr.serial = htonl(serial);
+ _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
+ serial,
+ ntohs(ack.maxSkew),
+ ntohl(ack.firstPacket),
+ ntohl(ack.previousPacket),
+ ntohl(ack.serial),
+ rxrpc_acks(ack.reason),
+ ack.nAcks);
+
+ del_timer_sync(&call->ack_timer);
+ if (ack.nAcks > 0)
+ set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags);
+ goto send_message_2;
+
+send_message:
+ _debug("send message");
+
+ serial = atomic_inc_return(&call->conn->serial);
+ whdr.serial = htonl(serial);
+ _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial);
+send_message_2:
+
+ len = iov[0].iov_len;
+ ioc = 1;
+ if (iov[4].iov_len) {
+ ioc = 5;
+ len += iov[4].iov_len;
+ len += iov[3].iov_len;
+ len += iov[2].iov_len;
+ len += iov[1].iov_len;
+ } else if (iov[3].iov_len) {
+ ioc = 4;
+ len += iov[3].iov_len;
+ len += iov[2].iov_len;
+ len += iov[1].iov_len;
+ } else if (iov[2].iov_len) {
+ ioc = 3;
+ len += iov[2].iov_len;
+ len += iov[1].iov_len;
+ } else if (iov[1].iov_len) {
+ ioc = 2;
+ len += iov[1].iov_len;
+ }
+
+ ret = kernel_sendmsg(call->conn->params.local->socket,
+ &msg, iov, ioc, len);
+ if (ret < 0) {
+ _debug("sendmsg failed: %d", ret);
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD)
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+ goto error;
+ }
+
+ switch (genbit) {
+ case RXRPC_CALL_EV_ABORT:
+ clear_bit(genbit, &call->events);
+ clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
+ goto kill_ACKs;
+
+ case RXRPC_CALL_EV_ACK_FINAL:
+ write_lock_bh(&call->state_lock);
+ if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK)
+ call->state = RXRPC_CALL_COMPLETE;
+ write_unlock_bh(&call->state_lock);
+ goto kill_ACKs;
+
+ default:
+ clear_bit(genbit, &call->events);
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ case RXRPC_CALL_CLIENT_RECV_REPLY:
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ case RXRPC_CALL_SERVER_ACK_REQUEST:
+ _debug("start ACK timer");
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY,
+ call->ackr_serial, false);
+ default:
+ break;
+ }
+ goto maybe_reschedule;
+ }
+
+kill_ACKs:
+ del_timer_sync(&call->ack_timer);
+ if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events))
+ rxrpc_put_call(call);
+ clear_bit(RXRPC_CALL_EV_ACK, &call->events);
+
+maybe_reschedule:
+ if (call->events || !skb_queue_empty(&call->rx_queue)) {
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD)
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+ }
+
+ /* don't leave aborted connections on the accept queue */
+ if (call->state >= RXRPC_CALL_COMPLETE &&
+ !list_empty(&call->accept_link)) {
+ _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
+ call, call->events, call->flags, call->conn->proto.cid);
+
+ read_lock_bh(&call->state_lock);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+ }
+
+error:
+ clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags);
+ kfree(acks);
+
+ /* because we don't want two CPUs both processing the work item for one
+ * call at the same time, we use a flag to note when it's busy; however
+ * this means there's a race between clearing the flag and setting the
+ * work pending bit and the work item being processed again */
+ if (call->events && !work_pending(&call->processor)) {
+ _debug("jumpstart %x", call->conn->proto.cid);
+ rxrpc_queue_call(call);
+ }
+
+ _leave("");
+ return;
+
+no_mem:
+ _debug("out of memory");
+ goto maybe_reschedule;
+}
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
new file mode 100644
index 000000000..ae057e074
--- /dev/null
+++ b/net/rxrpc/call_object.c
@@ -0,0 +1,801 @@
+/* RxRPC individual remote procedure call handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/circ_buf.h>
+#include <linux/spinlock_types.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Maximum lifetime of a call (in jiffies).
+ */
+unsigned int rxrpc_max_call_lifetime = 60 * HZ;
+
+/*
+ * Time till dead call expires after last use (in jiffies).
+ */
+unsigned int rxrpc_dead_call_expiry = 2 * HZ;
+
+const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
+ [RXRPC_CALL_UNINITIALISED] = "Uninit",
+ [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn",
+ [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq",
+ [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl",
+ [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl",
+ [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK",
+ [RXRPC_CALL_SERVER_SECURING] = "SvSecure",
+ [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept",
+ [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq",
+ [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq",
+ [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl",
+ [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK",
+ [RXRPC_CALL_COMPLETE] = "Complete",
+ [RXRPC_CALL_SERVER_BUSY] = "SvBusy ",
+ [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort",
+ [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort",
+ [RXRPC_CALL_NETWORK_ERROR] = "NetError",
+ [RXRPC_CALL_DEAD] = "Dead ",
+};
+
+struct kmem_cache *rxrpc_call_jar;
+LIST_HEAD(rxrpc_calls);
+DEFINE_RWLOCK(rxrpc_call_lock);
+
+static void rxrpc_destroy_call(struct work_struct *work);
+static void rxrpc_call_life_expired(unsigned long _call);
+static void rxrpc_dead_call_expired(unsigned long _call);
+static void rxrpc_ack_time_expired(unsigned long _call);
+static void rxrpc_resend_time_expired(unsigned long _call);
+
+/*
+ * find an extant server call
+ * - called in process context with IRQs enabled
+ */
+struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx,
+ unsigned long user_call_ID)
+{
+ struct rxrpc_call *call;
+ struct rb_node *p;
+
+ _enter("%p,%lx", rx, user_call_ID);
+
+ read_lock(&rx->call_lock);
+
+ p = rx->calls.rb_node;
+ while (p) {
+ call = rb_entry(p, struct rxrpc_call, sock_node);
+
+ if (user_call_ID < call->user_call_ID)
+ p = p->rb_left;
+ else if (user_call_ID > call->user_call_ID)
+ p = p->rb_right;
+ else
+ goto found_extant_call;
+ }
+
+ read_unlock(&rx->call_lock);
+ _leave(" = NULL");
+ return NULL;
+
+found_extant_call:
+ rxrpc_get_call(call);
+ read_unlock(&rx->call_lock);
+ _leave(" = %p [%d]", call, atomic_read(&call->usage));
+ return call;
+}
+
+/*
+ * allocate a new call
+ */
+static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
+{
+ struct rxrpc_call *call;
+
+ call = kmem_cache_zalloc(rxrpc_call_jar, gfp);
+ if (!call)
+ return NULL;
+
+ call->acks_winsz = 16;
+ call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long),
+ gfp);
+ if (!call->acks_window) {
+ kmem_cache_free(rxrpc_call_jar, call);
+ return NULL;
+ }
+
+ setup_timer(&call->lifetimer, &rxrpc_call_life_expired,
+ (unsigned long) call);
+ setup_timer(&call->deadspan, &rxrpc_dead_call_expired,
+ (unsigned long) call);
+ setup_timer(&call->ack_timer, &rxrpc_ack_time_expired,
+ (unsigned long) call);
+ setup_timer(&call->resend_timer, &rxrpc_resend_time_expired,
+ (unsigned long) call);
+ INIT_WORK(&call->destroyer, &rxrpc_destroy_call);
+ INIT_WORK(&call->processor, &rxrpc_process_call);
+ INIT_LIST_HEAD(&call->link);
+ INIT_LIST_HEAD(&call->accept_link);
+ skb_queue_head_init(&call->rx_queue);
+ skb_queue_head_init(&call->rx_oos_queue);
+ init_waitqueue_head(&call->tx_waitq);
+ spin_lock_init(&call->lock);
+ rwlock_init(&call->state_lock);
+ atomic_set(&call->usage, 1);
+ call->debug_id = atomic_inc_return(&rxrpc_debug_id);
+
+ memset(&call->sock_node, 0xed, sizeof(call->sock_node));
+
+ call->rx_data_expect = 1;
+ call->rx_data_eaten = 0;
+ call->rx_first_oos = 0;
+ call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size;
+ call->creation_jif = jiffies;
+ return call;
+}
+
+/*
+ * Allocate a new client call.
+ */
+static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
+{
+ struct rxrpc_call *call;
+
+ _enter("");
+
+ ASSERT(rx->local != NULL);
+
+ call = rxrpc_alloc_call(gfp);
+ if (!call)
+ return ERR_PTR(-ENOMEM);
+ call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
+
+ sock_hold(&rx->sk);
+ call->socket = rx;
+ call->rx_data_post = 1;
+
+ call->local = rx->local;
+ call->service_id = srx->srx_service;
+ call->in_clientflag = 0;
+
+ _leave(" = %p", call);
+ return call;
+}
+
+/*
+ * Begin client call.
+ */
+static int rxrpc_begin_client_call(struct rxrpc_call *call,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
+{
+ int ret;
+
+ /* Set up or get a connection record and set the protocol parameters,
+ * including channel number and call ID.
+ */
+ ret = rxrpc_connect_call(call, cp, srx, gfp);
+ if (ret < 0)
+ return ret;
+
+ call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
+
+ spin_lock(&call->conn->params.peer->lock);
+ hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets);
+ spin_unlock(&call->conn->params.peer->lock);
+
+ call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
+ add_timer(&call->lifetimer);
+ return 0;
+}
+
+/*
+ * set up a call for the given data
+ * - called in process context with IRQs enabled
+ */
+struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
+ unsigned long user_call_ID,
+ gfp_t gfp)
+{
+ struct rxrpc_call *call, *xcall;
+ struct rb_node *parent, **pp;
+ int ret;
+
+ _enter("%p,%lx", rx, user_call_ID);
+
+ call = rxrpc_alloc_client_call(rx, srx, gfp);
+ if (IS_ERR(call)) {
+ _leave(" = %ld", PTR_ERR(call));
+ return call;
+ }
+
+ /* Publish the call, even though it is incompletely set up as yet */
+ call->user_call_ID = user_call_ID;
+ __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+
+ write_lock(&rx->call_lock);
+
+ pp = &rx->calls.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ xcall = rb_entry(parent, struct rxrpc_call, sock_node);
+
+ if (user_call_ID < xcall->user_call_ID)
+ pp = &(*pp)->rb_left;
+ else if (user_call_ID > xcall->user_call_ID)
+ pp = &(*pp)->rb_right;
+ else
+ goto found_user_ID_now_present;
+ }
+
+ rxrpc_get_call(call);
+
+ rb_link_node(&call->sock_node, parent, pp);
+ rb_insert_color(&call->sock_node, &rx->calls);
+ write_unlock(&rx->call_lock);
+
+ write_lock_bh(&rxrpc_call_lock);
+ list_add_tail(&call->link, &rxrpc_calls);
+ write_unlock_bh(&rxrpc_call_lock);
+
+ ret = rxrpc_begin_client_call(call, cp, srx, gfp);
+ if (ret < 0)
+ goto error;
+
+ _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
+
+ _leave(" = %p [new]", call);
+ return call;
+
+error:
+ write_lock(&rx->call_lock);
+ rb_erase(&call->sock_node, &rx->calls);
+ write_unlock(&rx->call_lock);
+ rxrpc_put_call(call);
+
+ write_lock_bh(&rxrpc_call_lock);
+ list_del_init(&call->link);
+ write_unlock_bh(&rxrpc_call_lock);
+
+ set_bit(RXRPC_CALL_RELEASED, &call->flags);
+ call->state = RXRPC_CALL_DEAD;
+ rxrpc_put_call(call);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+
+ /* We unexpectedly found the user ID in the list after taking
+ * the call_lock. This shouldn't happen unless the user races
+ * with itself and tries to add the same user ID twice at the
+ * same time in different threads.
+ */
+found_user_ID_now_present:
+ write_unlock(&rx->call_lock);
+ set_bit(RXRPC_CALL_RELEASED, &call->flags);
+ call->state = RXRPC_CALL_DEAD;
+ rxrpc_put_call(call);
+ _leave(" = -EEXIST [%p]", call);
+ return ERR_PTR(-EEXIST);
+}
+
+/*
+ * set up an incoming call
+ * - called in process context with IRQs enabled
+ */
+struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
+ struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_call *call, *candidate;
+ u32 call_id, chan;
+
+ _enter(",%d", conn->debug_id);
+
+ ASSERT(rx != NULL);
+
+ candidate = rxrpc_alloc_call(GFP_NOIO);
+ if (!candidate)
+ return ERR_PTR(-EBUSY);
+
+ chan = sp->hdr.cid & RXRPC_CHANNELMASK;
+ candidate->socket = rx;
+ candidate->conn = conn;
+ candidate->cid = sp->hdr.cid;
+ candidate->call_id = sp->hdr.callNumber;
+ candidate->channel = chan;
+ candidate->rx_data_post = 0;
+ candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
+ if (conn->security_ix > 0)
+ candidate->state = RXRPC_CALL_SERVER_SECURING;
+
+ spin_lock(&conn->channel_lock);
+
+ /* set the channel for this call */
+ call = rcu_dereference_protected(conn->channels[chan].call,
+ lockdep_is_held(&conn->channel_lock));
+
+ _debug("channel[%u] is %p", candidate->channel, call);
+ if (call && call->call_id == sp->hdr.callNumber) {
+ /* already set; must've been a duplicate packet */
+ _debug("extant call [%d]", call->state);
+ ASSERTCMP(call->conn, ==, conn);
+
+ read_lock(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
+ rxrpc_queue_call(call);
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ read_unlock(&call->state_lock);
+ goto aborted_call;
+ default:
+ rxrpc_get_call(call);
+ read_unlock(&call->state_lock);
+ goto extant_call;
+ }
+ }
+
+ if (call) {
+ /* it seems the channel is still in use from the previous call
+ * - ditch the old binding if its call is now complete */
+ _debug("CALL: %u { %s }",
+ call->debug_id, rxrpc_call_states[call->state]);
+
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ __rxrpc_disconnect_call(call);
+ } else {
+ spin_unlock(&conn->channel_lock);
+ kmem_cache_free(rxrpc_call_jar, candidate);
+ _leave(" = -EBUSY");
+ return ERR_PTR(-EBUSY);
+ }
+ }
+
+ /* check the call number isn't duplicate */
+ _debug("check dup");
+ call_id = sp->hdr.callNumber;
+
+ /* We just ignore calls prior to the current call ID. Terminated calls
+ * are handled via the connection.
+ */
+ if (call_id <= conn->channels[chan].call_counter)
+ goto old_call; /* TODO: Just drop packet */
+
+ /* make the call available */
+ _debug("new call");
+ call = candidate;
+ candidate = NULL;
+ conn->channels[chan].call_counter = call_id;
+ rcu_assign_pointer(conn->channels[chan].call, call);
+ sock_hold(&rx->sk);
+ rxrpc_get_connection(conn);
+ spin_unlock(&conn->channel_lock);
+
+ spin_lock(&conn->params.peer->lock);
+ hlist_add_head(&call->error_link, &conn->params.peer->error_targets);
+ spin_unlock(&conn->params.peer->lock);
+
+ write_lock_bh(&rxrpc_call_lock);
+ list_add_tail(&call->link, &rxrpc_calls);
+ write_unlock_bh(&rxrpc_call_lock);
+
+ call->local = conn->params.local;
+ call->epoch = conn->proto.epoch;
+ call->service_id = conn->params.service_id;
+ call->in_clientflag = RXRPC_CLIENT_INITIATED;
+
+ _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
+
+ call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
+ add_timer(&call->lifetimer);
+ _leave(" = %p {%d} [new]", call, call->debug_id);
+ return call;
+
+extant_call:
+ spin_unlock(&conn->channel_lock);
+ kmem_cache_free(rxrpc_call_jar, candidate);
+ _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1);
+ return call;
+
+aborted_call:
+ spin_unlock(&conn->channel_lock);
+ kmem_cache_free(rxrpc_call_jar, candidate);
+ _leave(" = -ECONNABORTED");
+ return ERR_PTR(-ECONNABORTED);
+
+old_call:
+ spin_unlock(&conn->channel_lock);
+ kmem_cache_free(rxrpc_call_jar, candidate);
+ _leave(" = -ECONNRESET [old]");
+ return ERR_PTR(-ECONNRESET);
+}
+
+/*
+ * detach a call from a socket and set up for release
+ */
+void rxrpc_release_call(struct rxrpc_call *call)
+{
+ struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_sock *rx = call->socket;
+
+ _enter("{%d,%d,%d,%d}",
+ call->debug_id, atomic_read(&call->usage),
+ atomic_read(&call->ackr_not_idle),
+ call->rx_first_oos);
+
+ spin_lock_bh(&call->lock);
+ if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags))
+ BUG();
+ spin_unlock_bh(&call->lock);
+
+ /* dissociate from the socket
+ * - the socket's ref on the call is passed to the death timer
+ */
+ _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
+
+ spin_lock(&conn->params.peer->lock);
+ hlist_del_init(&call->error_link);
+ spin_unlock(&conn->params.peer->lock);
+
+ write_lock_bh(&rx->call_lock);
+ if (!list_empty(&call->accept_link)) {
+ _debug("unlinking once-pending call %p { e=%lx f=%lx }",
+ call, call->events, call->flags);
+ ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+ list_del_init(&call->accept_link);
+ sk_acceptq_removed(&rx->sk);
+ } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ rb_erase(&call->sock_node, &rx->calls);
+ memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
+ clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+ }
+ write_unlock_bh(&rx->call_lock);
+
+ /* free up the channel for reuse */
+ write_lock_bh(&call->state_lock);
+
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ call->state != RXRPC_CALL_CLIENT_FINAL_ACK) {
+ _debug("+++ ABORTING STATE %d +++\n", call->state);
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->local_abort = RX_CALL_DEAD;
+ }
+ write_unlock_bh(&call->state_lock);
+
+ rxrpc_disconnect_call(call);
+
+ /* clean up the Rx queue */
+ if (!skb_queue_empty(&call->rx_queue) ||
+ !skb_queue_empty(&call->rx_oos_queue)) {
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+
+ _debug("purge Rx queues");
+
+ spin_lock_bh(&call->lock);
+ while ((skb = skb_dequeue(&call->rx_queue)) ||
+ (skb = skb_dequeue(&call->rx_oos_queue))) {
+ spin_unlock_bh(&call->lock);
+
+ sp = rxrpc_skb(skb);
+ _debug("- zap %s %%%u #%u",
+ rxrpc_pkts[sp->hdr.type],
+ sp->hdr.serial, sp->hdr.seq);
+ rxrpc_free_skb(skb);
+ spin_lock_bh(&call->lock);
+ }
+ spin_unlock_bh(&call->lock);
+
+ ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE);
+ }
+
+ del_timer_sync(&call->resend_timer);
+ del_timer_sync(&call->ack_timer);
+ del_timer_sync(&call->lifetimer);
+ call->deadspan.expires = jiffies + rxrpc_dead_call_expiry;
+ add_timer(&call->deadspan);
+
+ _leave("");
+}
+
+/*
+ * handle a dead call being ready for reaping
+ */
+static void rxrpc_dead_call_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+ _enter("{%d}", call->debug_id);
+
+ write_lock_bh(&call->state_lock);
+ call->state = RXRPC_CALL_DEAD;
+ write_unlock_bh(&call->state_lock);
+ rxrpc_put_call(call);
+}
+
+/*
+ * mark a call as to be released, aborting it if it's still in progress
+ * - called with softirqs disabled
+ */
+static void rxrpc_mark_call_released(struct rxrpc_call *call)
+{
+ bool sched;
+
+ write_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD) {
+ sched = false;
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ _debug("abort call %p", call);
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->local_abort = RX_CALL_DEAD;
+ if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
+ sched = true;
+ }
+ if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
+ sched = true;
+ if (sched)
+ rxrpc_queue_call(call);
+ }
+ write_unlock(&call->state_lock);
+}
+
+/*
+ * release all the calls associated with a socket
+ */
+void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
+{
+ struct rxrpc_call *call;
+ struct rb_node *p;
+
+ _enter("%p", rx);
+
+ read_lock_bh(&rx->call_lock);
+
+ /* mark all the calls as no longer wanting incoming packets */
+ for (p = rb_first(&rx->calls); p; p = rb_next(p)) {
+ call = rb_entry(p, struct rxrpc_call, sock_node);
+ rxrpc_mark_call_released(call);
+ }
+
+ /* kill the not-yet-accepted incoming calls */
+ list_for_each_entry(call, &rx->secureq, accept_link) {
+ rxrpc_mark_call_released(call);
+ }
+
+ list_for_each_entry(call, &rx->acceptq, accept_link) {
+ rxrpc_mark_call_released(call);
+ }
+
+ read_unlock_bh(&rx->call_lock);
+ _leave("");
+}
+
+/*
+ * release a call
+ */
+void __rxrpc_put_call(struct rxrpc_call *call)
+{
+ ASSERT(call != NULL);
+
+ _enter("%p{u=%d}", call, atomic_read(&call->usage));
+
+ ASSERTCMP(atomic_read(&call->usage), >, 0);
+
+ if (atomic_dec_and_test(&call->usage)) {
+ _debug("call %d dead", call->debug_id);
+ WARN_ON(atomic_read(&call->skb_count) != 0);
+ ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
+ rxrpc_queue_work(&call->destroyer);
+ }
+ _leave("");
+}
+
+/*
+ * Final call destruction under RCU.
+ */
+static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
+{
+ struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
+
+ rxrpc_purge_queue(&call->rx_queue);
+ kmem_cache_free(rxrpc_call_jar, call);
+}
+
+/*
+ * clean up a call
+ */
+static void rxrpc_cleanup_call(struct rxrpc_call *call)
+{
+ _net("DESTROY CALL %d", call->debug_id);
+
+ ASSERT(call->socket);
+
+ memset(&call->sock_node, 0xcd, sizeof(call->sock_node));
+
+ del_timer_sync(&call->lifetimer);
+ del_timer_sync(&call->deadspan);
+ del_timer_sync(&call->ack_timer);
+ del_timer_sync(&call->resend_timer);
+
+ ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
+ ASSERTCMP(call->events, ==, 0);
+ if (work_pending(&call->processor)) {
+ _debug("defer destroy");
+ rxrpc_queue_work(&call->destroyer);
+ return;
+ }
+
+ ASSERTCMP(call->conn, ==, NULL);
+
+ if (call->acks_window) {
+ _debug("kill Tx window %d",
+ CIRC_CNT(call->acks_head, call->acks_tail,
+ call->acks_winsz));
+ smp_mb();
+ while (CIRC_CNT(call->acks_head, call->acks_tail,
+ call->acks_winsz) > 0) {
+ struct rxrpc_skb_priv *sp;
+ unsigned long _skb;
+
+ _skb = call->acks_window[call->acks_tail] & ~1;
+ sp = rxrpc_skb((struct sk_buff *)_skb);
+ _debug("+++ clear Tx %u", sp->hdr.seq);
+ rxrpc_free_skb((struct sk_buff *)_skb);
+ call->acks_tail =
+ (call->acks_tail + 1) & (call->acks_winsz - 1);
+ }
+
+ kfree(call->acks_window);
+ }
+
+ rxrpc_free_skb(call->tx_pending);
+
+ rxrpc_purge_queue(&call->rx_queue);
+ ASSERT(skb_queue_empty(&call->rx_oos_queue));
+ sock_put(&call->socket->sk);
+ call_rcu(&call->rcu, rxrpc_rcu_destroy_call);
+}
+
+/*
+ * destroy a call
+ */
+static void rxrpc_destroy_call(struct work_struct *work)
+{
+ struct rxrpc_call *call =
+ container_of(work, struct rxrpc_call, destroyer);
+
+ _enter("%p{%d,%d,%p}",
+ call, atomic_read(&call->usage), call->channel, call->conn);
+
+ ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
+
+ write_lock_bh(&rxrpc_call_lock);
+ list_del_init(&call->link);
+ write_unlock_bh(&rxrpc_call_lock);
+
+ rxrpc_cleanup_call(call);
+ _leave("");
+}
+
+/*
+ * preemptively destroy all the call records from a transport endpoint rather
+ * than waiting for them to time out
+ */
+void __exit rxrpc_destroy_all_calls(void)
+{
+ struct rxrpc_call *call;
+
+ _enter("");
+ write_lock_bh(&rxrpc_call_lock);
+
+ while (!list_empty(&rxrpc_calls)) {
+ call = list_entry(rxrpc_calls.next, struct rxrpc_call, link);
+ _debug("Zapping call %p", call);
+
+ list_del_init(&call->link);
+
+ switch (atomic_read(&call->usage)) {
+ case 0:
+ ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
+ break;
+ case 1:
+ if (del_timer_sync(&call->deadspan) != 0 &&
+ call->state != RXRPC_CALL_DEAD)
+ rxrpc_dead_call_expired((unsigned long) call);
+ if (call->state != RXRPC_CALL_DEAD)
+ break;
+ default:
+ pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n",
+ call, atomic_read(&call->usage),
+ atomic_read(&call->ackr_not_idle),
+ rxrpc_call_states[call->state],
+ call->flags, call->events);
+ if (!skb_queue_empty(&call->rx_queue))
+ pr_err("Rx queue occupied\n");
+ if (!skb_queue_empty(&call->rx_oos_queue))
+ pr_err("OOS queue occupied\n");
+ break;
+ }
+
+ write_unlock_bh(&rxrpc_call_lock);
+ cond_resched();
+ write_lock_bh(&rxrpc_call_lock);
+ }
+
+ write_unlock_bh(&rxrpc_call_lock);
+ _leave("");
+}
+
+/*
+ * handle call lifetime being exceeded
+ */
+static void rxrpc_call_life_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ _enter("{%d}", call->debug_id);
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
+ rxrpc_queue_call(call);
+ }
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * handle resend timer expiry
+ * - may not take call->state_lock as this can deadlock against del_timer_sync()
+ */
+static void rxrpc_resend_time_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+ _enter("{%d}", call->debug_id);
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
+ rxrpc_queue_call(call);
+}
+
+/*
+ * handle ACK timer expiry
+ */
+static void rxrpc_ack_time_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+ _enter("{%d}", call->debug_id);
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+}
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
new file mode 100644
index 000000000..9e91f27b0
--- /dev/null
+++ b/net/rxrpc/conn_client.c
@@ -0,0 +1,372 @@
+/* Client connection-specific management code.
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/timer.h>
+#include "ar-internal.h"
+
+/*
+ * We use machine-unique IDs for our client connections.
+ */
+DEFINE_IDR(rxrpc_client_conn_ids);
+static DEFINE_SPINLOCK(rxrpc_conn_id_lock);
+
+/*
+ * Get a connection ID and epoch for a client connection from the global pool.
+ * The connection struct pointer is then recorded in the idr radix tree. The
+ * epoch is changed if this wraps.
+ *
+ * TODO: The IDR tree gets very expensive on memory if the connection IDs are
+ * widely scattered throughout the number space, so we shall need to retire
+ * connections that have, say, an ID more than four times the maximum number of
+ * client conns away from the current allocation point to try and keep the IDs
+ * concentrated. We will also need to retire connections from an old epoch.
+ */
+static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn,
+ gfp_t gfp)
+{
+ u32 epoch;
+ int id;
+
+ _enter("");
+
+ idr_preload(gfp);
+ spin_lock(&rxrpc_conn_id_lock);
+
+ epoch = rxrpc_epoch;
+
+ /* We could use idr_alloc_cyclic() here, but we really need to know
+ * when the thing wraps so that we can advance the epoch.
+ */
+ if (rxrpc_client_conn_ids.cur == 0)
+ rxrpc_client_conn_ids.cur = 1;
+ id = idr_alloc(&rxrpc_client_conn_ids, conn,
+ rxrpc_client_conn_ids.cur, 0x40000000, GFP_NOWAIT);
+ if (id < 0) {
+ if (id != -ENOSPC)
+ goto error;
+ id = idr_alloc(&rxrpc_client_conn_ids, conn,
+ 1, 0x40000000, GFP_NOWAIT);
+ if (id < 0)
+ goto error;
+ epoch++;
+ rxrpc_epoch = epoch;
+ }
+ rxrpc_client_conn_ids.cur = id + 1;
+
+ spin_unlock(&rxrpc_conn_id_lock);
+ idr_preload_end();
+
+ conn->proto.epoch = epoch;
+ conn->proto.cid = id << RXRPC_CIDSHIFT;
+ set_bit(RXRPC_CONN_HAS_IDR, &conn->flags);
+ _leave(" [CID %x:%x]", epoch, conn->proto.cid);
+ return 0;
+
+error:
+ spin_unlock(&rxrpc_conn_id_lock);
+ idr_preload_end();
+ _leave(" = %d", id);
+ return id;
+}
+
+/*
+ * Release a connection ID for a client connection from the global pool.
+ */
+static void rxrpc_put_client_connection_id(struct rxrpc_connection *conn)
+{
+ if (test_bit(RXRPC_CONN_HAS_IDR, &conn->flags)) {
+ spin_lock(&rxrpc_conn_id_lock);
+ idr_remove(&rxrpc_client_conn_ids,
+ conn->proto.cid >> RXRPC_CIDSHIFT);
+ spin_unlock(&rxrpc_conn_id_lock);
+ }
+}
+
+/*
+ * Destroy the client connection ID tree.
+ */
+void rxrpc_destroy_client_conn_ids(void)
+{
+ struct rxrpc_connection *conn;
+ int id;
+
+ if (!idr_is_empty(&rxrpc_client_conn_ids)) {
+ idr_for_each_entry(&rxrpc_client_conn_ids, conn, id) {
+ pr_err("AF_RXRPC: Leaked client conn %p {%d}\n",
+ conn, atomic_read(&conn->usage));
+ }
+ BUG();
+ }
+
+ idr_destroy(&rxrpc_client_conn_ids);
+}
+
+/*
+ * Allocate a client connection. The caller must take care to clear any
+ * padding bytes in *cp.
+ */
+static struct rxrpc_connection *
+rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
+{
+ struct rxrpc_connection *conn;
+ int ret;
+
+ _enter("");
+
+ conn = rxrpc_alloc_connection(gfp);
+ if (!conn) {
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ conn->params = *cp;
+ conn->out_clientflag = RXRPC_CLIENT_INITIATED;
+ conn->state = RXRPC_CONN_CLIENT;
+
+ ret = rxrpc_get_client_connection_id(conn, gfp);
+ if (ret < 0)
+ goto error_0;
+
+ ret = rxrpc_init_client_conn_security(conn);
+ if (ret < 0)
+ goto error_1;
+
+ ret = conn->security->prime_packet_security(conn);
+ if (ret < 0)
+ goto error_2;
+
+ write_lock(&rxrpc_connection_lock);
+ list_add_tail(&conn->link, &rxrpc_connections);
+ write_unlock(&rxrpc_connection_lock);
+
+ /* We steal the caller's peer ref. */
+ cp->peer = NULL;
+ rxrpc_get_local(conn->params.local);
+ key_get(conn->params.key);
+
+ _leave(" = %p", conn);
+ return conn;
+
+error_2:
+ conn->security->clear(conn);
+error_1:
+ rxrpc_put_client_connection_id(conn);
+error_0:
+ kfree(conn);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * find a connection for a call
+ * - called in process context with IRQs enabled
+ */
+int rxrpc_connect_call(struct rxrpc_call *call,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
+{
+ struct rxrpc_connection *conn, *candidate = NULL;
+ struct rxrpc_local *local = cp->local;
+ struct rb_node *p, **pp, *parent;
+ long diff;
+ int chan;
+
+ DECLARE_WAITQUEUE(myself, current);
+
+ _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
+
+ cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp);
+ if (!cp->peer)
+ return -ENOMEM;
+
+ if (!cp->exclusive) {
+ /* Search for a existing client connection unless this is going
+ * to be a connection that's used exclusively for a single call.
+ */
+ _debug("search 1");
+ spin_lock(&local->client_conns_lock);
+ p = local->client_conns.rb_node;
+ while (p) {
+ conn = rb_entry(p, struct rxrpc_connection, client_node);
+
+#define cmp(X) ((long)conn->params.X - (long)cp->X)
+ diff = (cmp(peer) ?:
+ cmp(key) ?:
+ cmp(security_level));
+ if (diff < 0)
+ p = p->rb_left;
+ else if (diff > 0)
+ p = p->rb_right;
+ else
+ goto found_extant_conn;
+ }
+ spin_unlock(&local->client_conns_lock);
+ }
+
+ /* We didn't find a connection or we want an exclusive one. */
+ _debug("get new conn");
+ candidate = rxrpc_alloc_client_connection(cp, gfp);
+ if (!candidate) {
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ if (cp->exclusive) {
+ /* Assign the call on an exclusive connection to channel 0 and
+ * don't add the connection to the endpoint's shareable conn
+ * lookup tree.
+ */
+ _debug("exclusive chan 0");
+ conn = candidate;
+ atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1);
+ spin_lock(&conn->channel_lock);
+ chan = 0;
+ goto found_channel;
+ }
+
+ /* We need to redo the search before attempting to add a new connection
+ * lest we race with someone else adding a conflicting instance.
+ */
+ _debug("search 2");
+ spin_lock(&local->client_conns_lock);
+
+ pp = &local->client_conns.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ conn = rb_entry(parent, struct rxrpc_connection, client_node);
+
+ diff = (cmp(peer) ?:
+ cmp(key) ?:
+ cmp(security_level));
+ if (diff < 0)
+ pp = &(*pp)->rb_left;
+ else if (diff > 0)
+ pp = &(*pp)->rb_right;
+ else
+ goto found_extant_conn;
+ }
+
+ /* The second search also failed; simply add the new connection with
+ * the new call in channel 0. Note that we need to take the channel
+ * lock before dropping the client conn lock.
+ */
+ _debug("new conn");
+ set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
+ rb_link_node(&candidate->client_node, parent, pp);
+ rb_insert_color(&candidate->client_node, &local->client_conns);
+attached:
+ conn = candidate;
+ candidate = NULL;
+
+ atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1);
+ spin_lock(&conn->channel_lock);
+ spin_unlock(&local->client_conns_lock);
+ chan = 0;
+
+found_channel:
+ _debug("found chan");
+ call->conn = conn;
+ call->channel = chan;
+ call->epoch = conn->proto.epoch;
+ call->cid = conn->proto.cid | chan;
+ call->call_id = ++conn->channels[chan].call_counter;
+ conn->channels[chan].call_id = call->call_id;
+ rcu_assign_pointer(conn->channels[chan].call, call);
+
+ _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id);
+
+ spin_unlock(&conn->channel_lock);
+ rxrpc_put_peer(cp->peer);
+ cp->peer = NULL;
+ _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
+ return 0;
+
+ /* We found a potentially suitable connection already in existence. If
+ * we can reuse it (ie. its usage count hasn't been reduced to 0 by the
+ * reaper), discard any candidate we may have allocated, and try to get
+ * a channel on this one, otherwise we have to replace it.
+ */
+found_extant_conn:
+ _debug("found conn");
+ if (!rxrpc_get_connection_maybe(conn)) {
+ set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
+ rb_replace_node(&conn->client_node,
+ &candidate->client_node,
+ &local->client_conns);
+ clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags);
+ goto attached;
+ }
+
+ spin_unlock(&local->client_conns_lock);
+
+ rxrpc_put_connection(candidate);
+
+ if (!atomic_add_unless(&conn->avail_chans, -1, 0)) {
+ if (!gfpflags_allow_blocking(gfp)) {
+ rxrpc_put_connection(conn);
+ _leave(" = -EAGAIN");
+ return -EAGAIN;
+ }
+
+ add_wait_queue(&conn->channel_wq, &myself);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_add_unless(&conn->avail_chans, -1, 0))
+ break;
+ if (signal_pending(current))
+ goto interrupted;
+ schedule();
+ }
+ remove_wait_queue(&conn->channel_wq, &myself);
+ __set_current_state(TASK_RUNNING);
+ }
+
+ /* The connection allegedly now has a free channel and we can now
+ * attach the call to it.
+ */
+ spin_lock(&conn->channel_lock);
+
+ for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
+ if (!conn->channels[chan].call)
+ goto found_channel;
+ BUG();
+
+interrupted:
+ remove_wait_queue(&conn->channel_wq, &myself);
+ __set_current_state(TASK_RUNNING);
+ rxrpc_put_connection(conn);
+ rxrpc_put_peer(cp->peer);
+ cp->peer = NULL;
+ _leave(" = -ERESTARTSYS");
+ return -ERESTARTSYS;
+}
+
+/*
+ * Remove a client connection from the local endpoint's tree, thereby removing
+ * it as a target for reuse for new client calls.
+ */
+void rxrpc_unpublish_client_conn(struct rxrpc_connection *conn)
+{
+ struct rxrpc_local *local = conn->params.local;
+
+ spin_lock(&local->client_conns_lock);
+ if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags))
+ rb_erase(&conn->client_node, &local->client_conns);
+ spin_unlock(&local->client_conns_lock);
+
+ rxrpc_put_client_connection_id(conn);
+}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
new file mode 100644
index 000000000..cee0f35bc
--- /dev/null
+++ b/net/rxrpc/conn_event.c
@@ -0,0 +1,394 @@
+/* connection-level event handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+/*
+ * pass a connection-level abort onto all calls on that connection
+ */
+static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
+ u32 abort_code)
+{
+ struct rxrpc_call *call;
+ int i;
+
+ _enter("{%d},%x", conn->debug_id, abort_code);
+
+ spin_lock(&conn->channel_lock);
+
+ for (i = 0; i < RXRPC_MAXCALLS; i++) {
+ call = rcu_dereference_protected(
+ conn->channels[i].call,
+ lockdep_is_held(&conn->channel_lock));
+ write_lock_bh(&call->state_lock);
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = state;
+ if (state == RXRPC_CALL_LOCALLY_ABORTED) {
+ call->local_abort = conn->local_abort;
+ set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
+ } else {
+ call->remote_abort = conn->remote_abort;
+ set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
+ }
+ rxrpc_queue_call(call);
+ }
+ write_unlock_bh(&call->state_lock);
+ }
+
+ spin_unlock(&conn->channel_lock);
+ _leave("");
+}
+
+/*
+ * generate a connection-level abort
+ */
+static int rxrpc_abort_connection(struct rxrpc_connection *conn,
+ u32 error, u32 abort_code)
+{
+ struct rxrpc_wire_header whdr;
+ struct msghdr msg;
+ struct kvec iov[2];
+ __be32 word;
+ size_t len;
+ u32 serial;
+ int ret;
+
+ _enter("%d,,%u,%u", conn->debug_id, error, abort_code);
+
+ /* generate a connection-level abort */
+ spin_lock_bh(&conn->state_lock);
+ if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) {
+ conn->state = RXRPC_CONN_LOCALLY_ABORTED;
+ conn->error = error;
+ spin_unlock_bh(&conn->state_lock);
+ } else {
+ spin_unlock_bh(&conn->state_lock);
+ _leave(" = 0 [already dead]");
+ return 0;
+ }
+
+ rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code);
+
+ msg.msg_name = &conn->params.peer->srx.transport;
+ msg.msg_namelen = conn->params.peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ whdr.epoch = htonl(conn->proto.epoch);
+ whdr.cid = htonl(conn->proto.cid);
+ whdr.callNumber = 0;
+ whdr.seq = 0;
+ whdr.type = RXRPC_PACKET_TYPE_ABORT;
+ whdr.flags = conn->out_clientflag;
+ whdr.userStatus = 0;
+ whdr.securityIndex = conn->security_ix;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(conn->params.service_id);
+
+ word = htonl(conn->local_abort);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = &word;
+ iov[1].iov_len = sizeof(word);
+
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ serial = atomic_inc_return(&conn->serial);
+ whdr.serial = htonl(serial);
+ _proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort);
+
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+ if (ret < 0) {
+ _debug("sendmsg failed: %d", ret);
+ return -EAGAIN;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * mark a call as being on a now-secured channel
+ * - must be called with softirqs disabled
+ */
+static void rxrpc_call_is_secure(struct rxrpc_call *call)
+{
+ _enter("%p", call);
+ if (call) {
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock(&call->state_lock);
+ }
+}
+
+/*
+ * connection-level Rx packet processor
+ */
+static int rxrpc_process_event(struct rxrpc_connection *conn,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ __be32 wtmp;
+ u32 abort_code;
+ int loop, ret;
+
+ if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
+ kleave(" = -ECONNABORTED [%u]", conn->state);
+ return -ECONNABORTED;
+ }
+
+ _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial);
+
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_ABORT:
+ if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
+ return -EPROTO;
+ abort_code = ntohl(wtmp);
+ _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
+
+ conn->state = RXRPC_CONN_REMOTELY_ABORTED;
+ rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
+ abort_code);
+ return -ECONNABORTED;
+
+ case RXRPC_PACKET_TYPE_CHALLENGE:
+ return conn->security->respond_to_challenge(conn, skb,
+ _abort_code);
+
+ case RXRPC_PACKET_TYPE_RESPONSE:
+ ret = conn->security->verify_response(conn, skb, _abort_code);
+ if (ret < 0)
+ return ret;
+
+ ret = conn->security->init_connection_security(conn);
+ if (ret < 0)
+ return ret;
+
+ ret = conn->security->prime_packet_security(conn);
+ if (ret < 0)
+ return ret;
+
+ spin_lock(&conn->channel_lock);
+ spin_lock(&conn->state_lock);
+
+ if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) {
+ conn->state = RXRPC_CONN_SERVICE;
+ for (loop = 0; loop < RXRPC_MAXCALLS; loop++)
+ rxrpc_call_is_secure(
+ rcu_dereference_protected(
+ conn->channels[loop].call,
+ lockdep_is_held(&conn->channel_lock)));
+ }
+
+ spin_unlock(&conn->state_lock);
+ spin_unlock(&conn->channel_lock);
+ return 0;
+
+ default:
+ _leave(" = -EPROTO [%u]", sp->hdr.type);
+ return -EPROTO;
+ }
+}
+
+/*
+ * set up security and issue a challenge
+ */
+static void rxrpc_secure_connection(struct rxrpc_connection *conn)
+{
+ u32 abort_code;
+ int ret;
+
+ _enter("{%d}", conn->debug_id);
+
+ ASSERT(conn->security_ix != 0);
+
+ if (!conn->params.key) {
+ _debug("set up security");
+ ret = rxrpc_init_server_conn_security(conn);
+ switch (ret) {
+ case 0:
+ break;
+ case -ENOENT:
+ abort_code = RX_CALL_DEAD;
+ goto abort;
+ default:
+ abort_code = RXKADNOAUTH;
+ goto abort;
+ }
+ }
+
+ if (conn->security->issue_challenge(conn) < 0) {
+ abort_code = RX_CALL_DEAD;
+ ret = -ENOMEM;
+ goto abort;
+ }
+
+ _leave("");
+ return;
+
+abort:
+ _debug("abort %d, %d", ret, abort_code);
+ rxrpc_abort_connection(conn, -ret, abort_code);
+ _leave(" [aborted]");
+}
+
+/*
+ * connection-level event processor
+ */
+void rxrpc_process_connection(struct work_struct *work)
+{
+ struct rxrpc_connection *conn =
+ container_of(work, struct rxrpc_connection, processor);
+ struct sk_buff *skb;
+ u32 abort_code = RX_PROTOCOL_ERROR;
+ int ret;
+
+ _enter("{%d}", conn->debug_id);
+
+ if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events))
+ rxrpc_secure_connection(conn);
+
+ /* go through the conn-level event packets, releasing the ref on this
+ * connection that each one has when we've finished with it */
+ while ((skb = skb_dequeue(&conn->rx_queue))) {
+ ret = rxrpc_process_event(conn, skb, &abort_code);
+ switch (ret) {
+ case -EPROTO:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ goto protocol_error;
+ case -EAGAIN:
+ goto requeue_and_leave;
+ case -ECONNABORTED:
+ default:
+ rxrpc_free_skb(skb);
+ break;
+ }
+ }
+
+out:
+ rxrpc_put_connection(conn);
+ _leave("");
+ return;
+
+requeue_and_leave:
+ skb_queue_head(&conn->rx_queue, skb);
+ goto out;
+
+protocol_error:
+ if (rxrpc_abort_connection(conn, -ret, abort_code) < 0)
+ goto requeue_and_leave;
+ rxrpc_free_skb(skb);
+ _leave(" [EPROTO]");
+ goto out;
+}
+
+/*
+ * put a packet up for transport-level abort
+ */
+void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
+{
+ CHECK_SLAB_OKAY(&local->usage);
+
+ skb_queue_tail(&local->reject_queue, skb);
+ rxrpc_queue_local(local);
+}
+
+/*
+ * reject packets through the local endpoint
+ */
+void rxrpc_reject_packets(struct rxrpc_local *local)
+{
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+ } sa;
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_wire_header whdr;
+ struct sk_buff *skb;
+ struct msghdr msg;
+ struct kvec iov[2];
+ size_t size;
+ __be32 code;
+
+ _enter("%d", local->debug_id);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = &code;
+ iov[1].iov_len = sizeof(code);
+ size = sizeof(whdr) + sizeof(code);
+
+ msg.msg_name = &sa;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.sa.sa_family = local->srx.transport.family;
+ switch (sa.sa.sa_family) {
+ case AF_INET:
+ msg.msg_namelen = sizeof(sa.sin);
+ break;
+ default:
+ msg.msg_namelen = 0;
+ break;
+ }
+
+ memset(&whdr, 0, sizeof(whdr));
+ whdr.type = RXRPC_PACKET_TYPE_ABORT;
+
+ while ((skb = skb_dequeue(&local->reject_queue))) {
+ sp = rxrpc_skb(skb);
+ switch (sa.sa.sa_family) {
+ case AF_INET:
+ sa.sin.sin_port = udp_hdr(skb)->source;
+ sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+ code = htonl(skb->priority);
+
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.serviceId = htons(sp->hdr.serviceId);
+ whdr.flags = sp->hdr.flags;
+ whdr.flags ^= RXRPC_CLIENT_INITIATED;
+ whdr.flags &= RXRPC_CLIENT_INITIATED;
+
+ kernel_sendmsg(local->socket, &msg, iov, 2, size);
+ break;
+
+ default:
+ break;
+ }
+
+ rxrpc_free_skb(skb);
+ }
+
+ _leave("");
+}
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
new file mode 100644
index 000000000..896d84493
--- /dev/null
+++ b/net/rxrpc/conn_object.c
@@ -0,0 +1,340 @@
+/* RxRPC virtual connection handler
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Time till a connection expires after last use (in seconds).
+ */
+unsigned int rxrpc_connection_expiry = 10 * 60;
+
+static void rxrpc_connection_reaper(struct work_struct *work);
+
+LIST_HEAD(rxrpc_connections);
+DEFINE_RWLOCK(rxrpc_connection_lock);
+static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper);
+
+/*
+ * allocate a new connection
+ */
+struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
+{
+ struct rxrpc_connection *conn;
+
+ _enter("");
+
+ conn = kzalloc(sizeof(struct rxrpc_connection), gfp);
+ if (conn) {
+ spin_lock_init(&conn->channel_lock);
+ init_waitqueue_head(&conn->channel_wq);
+ INIT_WORK(&conn->processor, &rxrpc_process_connection);
+ INIT_LIST_HEAD(&conn->link);
+ skb_queue_head_init(&conn->rx_queue);
+ conn->security = &rxrpc_no_security;
+ spin_lock_init(&conn->state_lock);
+ /* We maintain an extra ref on the connection whilst it is
+ * on the rxrpc_connections list.
+ */
+ atomic_set(&conn->usage, 2);
+ conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ atomic_set(&conn->avail_chans, RXRPC_MAXCALLS);
+ conn->size_align = 4;
+ conn->header_size = sizeof(struct rxrpc_wire_header);
+ }
+
+ _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
+ return conn;
+}
+
+/*
+ * Look up a connection in the cache by protocol parameters.
+ *
+ * If successful, a pointer to the connection is returned, but no ref is taken.
+ * NULL is returned if there is no match.
+ *
+ * The caller must be holding the RCU read lock.
+ */
+struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
+ struct sk_buff *skb)
+{
+ struct rxrpc_connection *conn;
+ struct rxrpc_conn_proto k;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_peer *peer;
+
+ _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK);
+
+ if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
+ goto not_found;
+
+ k.epoch = sp->hdr.epoch;
+ k.cid = sp->hdr.cid & RXRPC_CIDMASK;
+
+ /* We may have to handle mixing IPv4 and IPv6 */
+ if (srx.transport.family != local->srx.transport.family) {
+ pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n",
+ srx.transport.family,
+ local->srx.transport.family);
+ goto not_found;
+ }
+
+ k.epoch = sp->hdr.epoch;
+ k.cid = sp->hdr.cid & RXRPC_CIDMASK;
+
+ if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) {
+ /* We need to look up service connections by the full protocol
+ * parameter set. We look up the peer first as an intermediate
+ * step and then the connection from the peer's tree.
+ */
+ peer = rxrpc_lookup_peer_rcu(local, &srx);
+ if (!peer)
+ goto not_found;
+ conn = rxrpc_find_service_conn_rcu(peer, skb);
+ if (!conn || atomic_read(&conn->usage) == 0)
+ goto not_found;
+ _leave(" = %p", conn);
+ return conn;
+ } else {
+ /* Look up client connections by connection ID alone as their
+ * IDs are unique for this machine.
+ */
+ conn = idr_find(&rxrpc_client_conn_ids,
+ sp->hdr.cid >> RXRPC_CIDSHIFT);
+ if (!conn || atomic_read(&conn->usage) == 0) {
+ _debug("no conn");
+ goto not_found;
+ }
+
+ if (conn->proto.epoch != k.epoch ||
+ conn->params.local != local)
+ goto not_found;
+
+ peer = conn->params.peer;
+ switch (srx.transport.family) {
+ case AF_INET:
+ if (peer->srx.transport.sin.sin_port !=
+ srx.transport.sin.sin_port ||
+ peer->srx.transport.sin.sin_addr.s_addr !=
+ srx.transport.sin.sin_addr.s_addr)
+ goto not_found;
+ break;
+ default:
+ BUG();
+ }
+
+ _leave(" = %p", conn);
+ return conn;
+ }
+
+not_found:
+ _leave(" = NULL");
+ return NULL;
+}
+
+/*
+ * Disconnect a call and clear any channel it occupies when that call
+ * terminates. The caller must hold the channel_lock and must release the
+ * call's ref on the connection.
+ */
+void __rxrpc_disconnect_call(struct rxrpc_call *call)
+{
+ struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_channel *chan = &conn->channels[call->channel];
+
+ _enter("%d,%d", conn->debug_id, call->channel);
+
+ if (rcu_access_pointer(chan->call) == call) {
+ /* Save the result of the call so that we can repeat it if necessary
+ * through the channel, whilst disposing of the actual call record.
+ */
+ chan->last_result = call->local_abort;
+ smp_wmb();
+ chan->last_call = chan->call_id;
+ chan->call_id = chan->call_counter;
+
+ rcu_assign_pointer(chan->call, NULL);
+ atomic_inc(&conn->avail_chans);
+ wake_up(&conn->channel_wq);
+ }
+
+ _leave("");
+}
+
+/*
+ * Disconnect a call and clear any channel it occupies when that call
+ * terminates.
+ */
+void rxrpc_disconnect_call(struct rxrpc_call *call)
+{
+ struct rxrpc_connection *conn = call->conn;
+
+ spin_lock(&conn->channel_lock);
+ __rxrpc_disconnect_call(call);
+ spin_unlock(&conn->channel_lock);
+
+ call->conn = NULL;
+ rxrpc_put_connection(conn);
+}
+
+/*
+ * release a virtual connection
+ */
+void rxrpc_put_connection(struct rxrpc_connection *conn)
+{
+ if (!conn)
+ return;
+
+ _enter("%p{u=%d,d=%d}",
+ conn, atomic_read(&conn->usage), conn->debug_id);
+
+ ASSERTCMP(atomic_read(&conn->usage), >, 1);
+
+ conn->put_time = ktime_get_seconds();
+ if (atomic_dec_return(&conn->usage) == 1) {
+ _debug("zombie");
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+ }
+
+ _leave("");
+}
+
+/*
+ * destroy a virtual connection
+ */
+static void rxrpc_destroy_connection(struct rcu_head *rcu)
+{
+ struct rxrpc_connection *conn =
+ container_of(rcu, struct rxrpc_connection, rcu);
+
+ _enter("{%d,u=%d}", conn->debug_id, atomic_read(&conn->usage));
+
+ ASSERTCMP(atomic_read(&conn->usage), ==, 0);
+
+ _net("DESTROY CONN %d", conn->debug_id);
+
+ rxrpc_purge_queue(&conn->rx_queue);
+
+ conn->security->clear(conn);
+ key_put(conn->params.key);
+ key_put(conn->server_key);
+ rxrpc_put_peer(conn->params.peer);
+ rxrpc_put_local(conn->params.local);
+
+ kfree(conn);
+ _leave("");
+}
+
+/*
+ * reap dead connections
+ */
+static void rxrpc_connection_reaper(struct work_struct *work)
+{
+ struct rxrpc_connection *conn, *_p;
+ unsigned long reap_older_than, earliest, put_time, now;
+
+ LIST_HEAD(graveyard);
+
+ _enter("");
+
+ now = ktime_get_seconds();
+ reap_older_than = now - rxrpc_connection_expiry;
+ earliest = ULONG_MAX;
+
+ write_lock(&rxrpc_connection_lock);
+ list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) {
+ ASSERTCMP(atomic_read(&conn->usage), >, 0);
+ if (likely(atomic_read(&conn->usage) > 1))
+ continue;
+
+ put_time = READ_ONCE(conn->put_time);
+ if (time_after(put_time, reap_older_than)) {
+ if (time_before(put_time, earliest))
+ earliest = put_time;
+ continue;
+ }
+
+ /* The usage count sits at 1 whilst the object is unused on the
+ * list; we reduce that to 0 to make the object unavailable.
+ */
+ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1)
+ continue;
+
+ if (rxrpc_conn_is_client(conn))
+ rxrpc_unpublish_client_conn(conn);
+ else
+ rxrpc_unpublish_service_conn(conn);
+
+ list_move_tail(&conn->link, &graveyard);
+ }
+ write_unlock(&rxrpc_connection_lock);
+
+ if (earliest != ULONG_MAX) {
+ _debug("reschedule reaper %ld", (long) earliest - now);
+ ASSERTCMP(earliest, >, now);
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap,
+ (earliest - now) * HZ);
+ }
+
+ while (!list_empty(&graveyard)) {
+ conn = list_entry(graveyard.next, struct rxrpc_connection,
+ link);
+ list_del_init(&conn->link);
+
+ ASSERTCMP(atomic_read(&conn->usage), ==, 0);
+ skb_queue_purge(&conn->rx_queue);
+ call_rcu(&conn->rcu, rxrpc_destroy_connection);
+ }
+
+ _leave("");
+}
+
+/*
+ * preemptively destroy all the connection records rather than waiting for them
+ * to time out
+ */
+void __exit rxrpc_destroy_all_connections(void)
+{
+ struct rxrpc_connection *conn, *_p;
+ bool leak = false;
+
+ _enter("");
+
+ rxrpc_connection_expiry = 0;
+ cancel_delayed_work(&rxrpc_connection_reap);
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+ flush_workqueue(rxrpc_workqueue);
+
+ write_lock(&rxrpc_connection_lock);
+ list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) {
+ pr_err("AF_RXRPC: Leaked conn %p {%d}\n",
+ conn, atomic_read(&conn->usage));
+ leak = true;
+ }
+ write_unlock(&rxrpc_connection_lock);
+ BUG_ON(leak);
+
+ /* Make sure the local and peer records pinned by any dying connections
+ * are released.
+ */
+ rcu_barrier();
+ rxrpc_destroy_client_conn_ids();
+
+ _leave("");
+}
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
new file mode 100644
index 000000000..fd9027ccb
--- /dev/null
+++ b/net/rxrpc/conn_service.c
@@ -0,0 +1,230 @@
+/* Service connection management
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include "ar-internal.h"
+
+/*
+ * Find a service connection under RCU conditions.
+ *
+ * We could use a hash table, but that is subject to bucket stuffing by an
+ * attacker as the client gets to pick the epoch and cid values and would know
+ * the hash function. So, instead, we use a hash table for the peer and from
+ * that an rbtree to find the service connection. Under ordinary circumstances
+ * it might be slower than a large hash table, but it is at least limited in
+ * depth.
+ */
+struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *peer,
+ struct sk_buff *skb)
+{
+ struct rxrpc_connection *conn = NULL;
+ struct rxrpc_conn_proto k;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rb_node *p;
+ unsigned int seq = 0;
+
+ k.epoch = sp->hdr.epoch;
+ k.cid = sp->hdr.cid & RXRPC_CIDMASK;
+
+ do {
+ /* Unfortunately, rbtree walking doesn't give reliable results
+ * under just the RCU read lock, so we have to check for
+ * changes.
+ */
+ read_seqbegin_or_lock(&peer->service_conn_lock, &seq);
+
+ p = rcu_dereference_raw(peer->service_conns.rb_node);
+ while (p) {
+ conn = rb_entry(p, struct rxrpc_connection, service_node);
+
+ if (conn->proto.index_key < k.index_key)
+ p = rcu_dereference_raw(p->rb_left);
+ else if (conn->proto.index_key > k.index_key)
+ p = rcu_dereference_raw(p->rb_right);
+ else
+ goto done;
+ conn = NULL;
+ }
+ } while (need_seqretry(&peer->service_conn_lock, seq));
+
+done:
+ done_seqretry(&peer->service_conn_lock, seq);
+ _leave(" = %d", conn ? conn->debug_id : -1);
+ return conn;
+}
+
+/*
+ * Insert a service connection into a peer's tree, thereby making it a target
+ * for incoming packets.
+ */
+static struct rxrpc_connection *
+rxrpc_publish_service_conn(struct rxrpc_peer *peer,
+ struct rxrpc_connection *conn)
+{
+ struct rxrpc_connection *cursor = NULL;
+ struct rxrpc_conn_proto k = conn->proto;
+ struct rb_node **pp, *parent;
+
+ write_seqlock_bh(&peer->service_conn_lock);
+
+ pp = &peer->service_conns.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ cursor = rb_entry(parent,
+ struct rxrpc_connection, service_node);
+
+ if (cursor->proto.index_key < k.index_key)
+ pp = &(*pp)->rb_left;
+ else if (cursor->proto.index_key > k.index_key)
+ pp = &(*pp)->rb_right;
+ else
+ goto found_extant_conn;
+ }
+
+ rb_link_node_rcu(&conn->service_node, parent, pp);
+ rb_insert_color(&conn->service_node, &peer->service_conns);
+conn_published:
+ set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags);
+ write_sequnlock_bh(&peer->service_conn_lock);
+ _leave(" = %d [new]", conn->debug_id);
+ return conn;
+
+found_extant_conn:
+ if (atomic_read(&cursor->usage) == 0)
+ goto replace_old_connection;
+ write_sequnlock_bh(&peer->service_conn_lock);
+ /* We should not be able to get here. rxrpc_incoming_connection() is
+ * called in a non-reentrant context, so there can't be a race to
+ * insert a new connection.
+ */
+ BUG();
+
+replace_old_connection:
+ /* The old connection is from an outdated epoch. */
+ _debug("replace conn");
+ rb_replace_node_rcu(&cursor->service_node,
+ &conn->service_node,
+ &peer->service_conns);
+ clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &cursor->flags);
+ goto conn_published;
+}
+
+/*
+ * get a record of an incoming connection
+ */
+struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local,
+ struct sockaddr_rxrpc *srx,
+ struct sk_buff *skb)
+{
+ struct rxrpc_connection *conn;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_peer *peer;
+ const char *new = "old";
+
+ _enter("");
+
+ peer = rxrpc_lookup_peer(local, srx, GFP_NOIO);
+ if (!peer) {
+ _debug("no peer");
+ return ERR_PTR(-EBUSY);
+ }
+
+ ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED);
+
+ rcu_read_lock();
+ peer = rxrpc_lookup_peer_rcu(local, srx);
+ if (peer) {
+ conn = rxrpc_find_service_conn_rcu(peer, skb);
+ if (conn) {
+ if (sp->hdr.securityIndex != conn->security_ix)
+ goto security_mismatch_rcu;
+ if (rxrpc_get_connection_maybe(conn))
+ goto found_extant_connection_rcu;
+
+ /* The conn has expired but we can't remove it without
+ * the appropriate lock, so we attempt to replace it
+ * when we have a new candidate.
+ */
+ }
+
+ if (!rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ }
+ rcu_read_unlock();
+
+ if (!peer) {
+ peer = rxrpc_lookup_peer(local, srx, GFP_NOIO);
+ if (!peer)
+ goto enomem;
+ }
+
+ /* We don't have a matching record yet. */
+ conn = rxrpc_alloc_connection(GFP_NOIO);
+ if (!conn)
+ goto enomem_peer;
+
+ conn->proto.epoch = sp->hdr.epoch;
+ conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK;
+ conn->params.local = local;
+ conn->params.peer = peer;
+ conn->params.service_id = sp->hdr.serviceId;
+ conn->security_ix = sp->hdr.securityIndex;
+ conn->out_clientflag = 0;
+ conn->state = RXRPC_CONN_SERVICE;
+ if (conn->params.service_id)
+ conn->state = RXRPC_CONN_SERVICE_UNSECURED;
+
+ rxrpc_get_local(local);
+
+ write_lock(&rxrpc_connection_lock);
+ list_add_tail(&conn->link, &rxrpc_connections);
+ write_unlock(&rxrpc_connection_lock);
+
+ /* Make the connection a target for incoming packets. */
+ rxrpc_publish_service_conn(peer, conn);
+
+ new = "new";
+
+success:
+ _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid);
+ _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
+ return conn;
+
+found_extant_connection_rcu:
+ rcu_read_unlock();
+ goto success;
+
+security_mismatch_rcu:
+ rcu_read_unlock();
+ _leave(" = -EKEYREJECTED");
+ return ERR_PTR(-EKEYREJECTED);
+
+enomem_peer:
+ rxrpc_put_peer(peer);
+enomem:
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Remove the service connection from the peer's tree, thereby removing it as a
+ * target for incoming packets.
+ */
+void rxrpc_unpublish_service_conn(struct rxrpc_connection *conn)
+{
+ struct rxrpc_peer *peer = conn->params.peer;
+
+ write_seqlock_bh(&peer->service_conn_lock);
+ if (test_and_clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags))
+ rb_erase(&conn->service_node, &peer->service_conns);
+ write_sequnlock_bh(&peer->service_conn_lock);
+}
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
new file mode 100644
index 000000000..70bb77818
--- /dev/null
+++ b/net/rxrpc/input.c
@@ -0,0 +1,757 @@
+/* RxRPC packet reception
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/net_namespace.h>
+#include "ar-internal.h"
+
+/*
+ * queue a packet for recvmsg to pass to userspace
+ * - the caller must hold a lock on call->lock
+ * - must not be called with interrupts disabled (sk_filter() disables BH's)
+ * - eats the packet whether successful or not
+ * - there must be just one reference to the packet, which the caller passes to
+ * this function
+ */
+int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
+ bool force, bool terminal)
+{
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_sock *rx = call->socket;
+ struct sock *sk;
+ int ret;
+
+ _enter(",,%d,%d", force, terminal);
+
+ ASSERT(!irqs_disabled());
+
+ sp = rxrpc_skb(skb);
+ ASSERTCMP(sp->call, ==, call);
+
+ /* if we've already posted the terminal message for a call, then we
+ * don't post any more */
+ if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
+ _debug("already terminated");
+ ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE);
+ rxrpc_free_skb(skb);
+ return 0;
+ }
+
+ sk = &rx->sk;
+
+ if (!force) {
+ /* cast skb->rcvbuf to unsigned... It's pointless, but
+ * reduces number of warnings when compiling with -W
+ * --ANK */
+// ret = -ENOBUFS;
+// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+// (unsigned int) sk->sk_rcvbuf)
+// goto out;
+
+ ret = sk_filter(sk, skb);
+ if (ret < 0)
+ goto out;
+ }
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) &&
+ !test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ call->socket->sk.sk_state != RXRPC_CLOSE) {
+ skb->destructor = rxrpc_packet_destructor;
+ skb->dev = NULL;
+ skb->sk = sk;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+
+ if (terminal) {
+ _debug("<<<< TERMINAL MESSAGE >>>>");
+ set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags);
+ }
+
+ /* allow interception by a kernel service */
+ if (rx->interceptor) {
+ rx->interceptor(sk, call->user_call_ID, skb);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ } else {
+ _net("post skb %p", skb);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ }
+ skb = NULL;
+ } else {
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+ ret = 0;
+
+out:
+ rxrpc_free_skb(skb);
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * process a DATA packet, posting the packet to the appropriate queue
+ * - eats the packet if successful
+ */
+static int rxrpc_fast_process_data(struct rxrpc_call *call,
+ struct sk_buff *skb, u32 seq)
+{
+ struct rxrpc_skb_priv *sp;
+ bool terminal;
+ int ret, ackbit, ack;
+ u32 serial;
+ u8 flags;
+
+ _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq);
+
+ sp = rxrpc_skb(skb);
+ ASSERTCMP(sp->call, ==, NULL);
+ flags = sp->hdr.flags;
+ serial = sp->hdr.serial;
+
+ spin_lock(&call->lock);
+
+ if (call->state > RXRPC_CALL_COMPLETE)
+ goto discard;
+
+ ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post);
+ ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv);
+ ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten);
+
+ if (seq < call->rx_data_post) {
+ _debug("dup #%u [-%u]", seq, call->rx_data_post);
+ ack = RXRPC_ACK_DUPLICATE;
+ ret = -ENOBUFS;
+ goto discard_and_ack;
+ }
+
+ /* we may already have the packet in the out of sequence queue */
+ ackbit = seq - (call->rx_data_eaten + 1);
+ ASSERTCMP(ackbit, >=, 0);
+ if (__test_and_set_bit(ackbit, call->ackr_window)) {
+ _debug("dup oos #%u [%u,%u]",
+ seq, call->rx_data_eaten, call->rx_data_post);
+ ack = RXRPC_ACK_DUPLICATE;
+ goto discard_and_ack;
+ }
+
+ if (seq >= call->ackr_win_top) {
+ _debug("exceed #%u [%u]", seq, call->ackr_win_top);
+ __clear_bit(ackbit, call->ackr_window);
+ ack = RXRPC_ACK_EXCEEDS_WINDOW;
+ goto discard_and_ack;
+ }
+
+ if (seq == call->rx_data_expect) {
+ clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags);
+ call->rx_data_expect++;
+ } else if (seq > call->rx_data_expect) {
+ _debug("oos #%u [%u]", seq, call->rx_data_expect);
+ call->rx_data_expect = seq + 1;
+ if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) {
+ ack = RXRPC_ACK_OUT_OF_SEQUENCE;
+ goto enqueue_and_ack;
+ }
+ goto enqueue_packet;
+ }
+
+ if (seq != call->rx_data_post) {
+ _debug("ahead #%u [%u]", seq, call->rx_data_post);
+ goto enqueue_packet;
+ }
+
+ if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags))
+ goto protocol_error;
+
+ /* if the packet need security things doing to it, then it goes down
+ * the slow path */
+ if (call->conn->security_ix)
+ goto enqueue_packet;
+
+ sp->call = call;
+ rxrpc_get_call(call);
+ atomic_inc(&call->skb_count);
+ terminal = ((flags & RXRPC_LAST_PACKET) &&
+ !(flags & RXRPC_CLIENT_INITIATED));
+ ret = rxrpc_queue_rcv_skb(call, skb, false, terminal);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOBUFS) {
+ __clear_bit(ackbit, call->ackr_window);
+ ack = RXRPC_ACK_NOSPACE;
+ goto discard_and_ack;
+ }
+ goto out;
+ }
+
+ skb = NULL;
+ sp = NULL;
+
+ _debug("post #%u", seq);
+ ASSERTCMP(call->rx_data_post, ==, seq);
+ call->rx_data_post++;
+
+ if (flags & RXRPC_LAST_PACKET)
+ set_bit(RXRPC_CALL_RCVD_LAST, &call->flags);
+
+ /* if we've reached an out of sequence packet then we need to drain
+ * that queue into the socket Rx queue now */
+ if (call->rx_data_post == call->rx_first_oos) {
+ _debug("drain rx oos now");
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock(&call->state_lock);
+ }
+
+ spin_unlock(&call->lock);
+ atomic_inc(&call->ackr_not_idle);
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false);
+ _leave(" = 0 [posted]");
+ return 0;
+
+protocol_error:
+ ret = -EBADMSG;
+out:
+ spin_unlock(&call->lock);
+ _leave(" = %d", ret);
+ return ret;
+
+discard_and_ack:
+ _debug("discard and ACK packet %p", skb);
+ __rxrpc_propose_ACK(call, ack, serial, true);
+discard:
+ spin_unlock(&call->lock);
+ rxrpc_free_skb(skb);
+ _leave(" = 0 [discarded]");
+ return 0;
+
+enqueue_and_ack:
+ __rxrpc_propose_ACK(call, ack, serial, true);
+enqueue_packet:
+ _net("defer skb %p", skb);
+ spin_unlock(&call->lock);
+ skb_queue_tail(&call->rx_queue, skb);
+ atomic_inc(&call->ackr_not_idle);
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD)
+ rxrpc_queue_call(call);
+ read_unlock(&call->state_lock);
+ _leave(" = 0 [queued]");
+ return 0;
+}
+
+/*
+ * assume an implicit ACKALL of the transmission phase of a client socket upon
+ * reception of the first reply packet
+ */
+static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
+{
+ write_lock_bh(&call->state_lock);
+
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+ call->acks_latest = serial;
+
+ _debug("implicit ACKALL %%%u", call->acks_latest);
+ set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events);
+ write_unlock_bh(&call->state_lock);
+
+ if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
+ clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_EV_RESEND, &call->events);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ }
+ break;
+
+ default:
+ write_unlock_bh(&call->state_lock);
+ break;
+ }
+}
+
+/*
+ * post an incoming packet to the nominated call to deal with
+ * - must get rid of the sk_buff, either by freeing it or by queuing it
+ */
+void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ __be32 wtmp;
+ u32 hi_serial, abort_code;
+
+ _enter("%p,%p", call, skb);
+
+ ASSERT(!irqs_disabled());
+
+#if 0 // INJECT RX ERROR
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
+ static int skip = 0;
+ if (++skip == 3) {
+ printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n");
+ skip = 0;
+ goto free_packet;
+ }
+ }
+#endif
+
+ /* track the latest serial number on this connection for ACK packet
+ * information */
+ hi_serial = atomic_read(&call->conn->hi_serial);
+ while (sp->hdr.serial > hi_serial)
+ hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial,
+ sp->hdr.serial);
+
+ /* request ACK generation for any ACK or DATA packet that requests
+ * it */
+ if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
+ _proto("ACK Requested on %%%u", sp->hdr.serial);
+ rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false);
+ }
+
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_ABORT:
+ _debug("abort");
+
+ if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
+ goto protocol_error;
+
+ abort_code = ntohl(wtmp);
+ _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
+
+ write_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_REMOTELY_ABORTED;
+ call->remote_abort = abort_code;
+ set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+ goto free_packet_unlock;
+
+ case RXRPC_PACKET_TYPE_BUSY:
+ _proto("Rx BUSY %%%u", sp->hdr.serial);
+
+ if (rxrpc_conn_is_service(call->conn))
+ goto protocol_error;
+
+ write_lock_bh(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ call->state = RXRPC_CALL_SERVER_BUSY;
+ set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
+ rxrpc_queue_call(call);
+ case RXRPC_CALL_SERVER_BUSY:
+ goto free_packet_unlock;
+ default:
+ goto protocol_error_locked;
+ }
+
+ default:
+ _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
+ goto protocol_error;
+
+ case RXRPC_PACKET_TYPE_DATA:
+ _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
+
+ if (sp->hdr.seq == 0)
+ goto protocol_error;
+
+ call->ackr_prev_seq = sp->hdr.seq;
+
+ /* received data implicitly ACKs all of the request packets we
+ * sent when we're acting as a client */
+ if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
+ rxrpc_assume_implicit_ackall(call, sp->hdr.serial);
+
+ switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) {
+ case 0:
+ skb = NULL;
+ goto done;
+
+ default:
+ BUG();
+
+ /* data packet received beyond the last packet */
+ case -EBADMSG:
+ goto protocol_error;
+ }
+
+ case RXRPC_PACKET_TYPE_ACKALL:
+ case RXRPC_PACKET_TYPE_ACK:
+ /* ACK processing is done in process context */
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD) {
+ skb_queue_tail(&call->rx_queue, skb);
+ rxrpc_queue_call(call);
+ skb = NULL;
+ }
+ read_unlock_bh(&call->state_lock);
+ goto free_packet;
+ }
+
+protocol_error:
+ _debug("protocol error");
+ write_lock_bh(&call->state_lock);
+protocol_error_locked:
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->local_abort = RX_PROTOCOL_ERROR;
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+free_packet_unlock:
+ write_unlock_bh(&call->state_lock);
+free_packet:
+ rxrpc_free_skb(skb);
+done:
+ _leave("");
+}
+
+/*
+ * split up a jumbo data packet
+ */
+static void rxrpc_process_jumbo_packet(struct rxrpc_call *call,
+ struct sk_buff *jumbo)
+{
+ struct rxrpc_jumbo_header jhdr;
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *part;
+
+ _enter(",{%u,%u}", jumbo->data_len, jumbo->len);
+
+ sp = rxrpc_skb(jumbo);
+
+ do {
+ sp->hdr.flags &= ~RXRPC_JUMBO_PACKET;
+
+ /* make a clone to represent the first subpacket in what's left
+ * of the jumbo packet */
+ part = skb_clone(jumbo, GFP_ATOMIC);
+ if (!part) {
+ /* simply ditch the tail in the event of ENOMEM */
+ pskb_trim(jumbo, RXRPC_JUMBO_DATALEN);
+ break;
+ }
+ rxrpc_new_skb(part);
+
+ pskb_trim(part, RXRPC_JUMBO_DATALEN);
+
+ if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN))
+ goto protocol_error;
+
+ if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0)
+ goto protocol_error;
+ if (!pskb_pull(jumbo, sizeof(jhdr)))
+ BUG();
+
+ sp->hdr.seq += 1;
+ sp->hdr.serial += 1;
+ sp->hdr.flags = jhdr.flags;
+ sp->hdr._rsvd = ntohs(jhdr._rsvd);
+
+ _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1);
+
+ rxrpc_fast_process_packet(call, part);
+ part = NULL;
+
+ } while (sp->hdr.flags & RXRPC_JUMBO_PACKET);
+
+ rxrpc_fast_process_packet(call, jumbo);
+ _leave("");
+ return;
+
+protocol_error:
+ _debug("protocol error");
+ rxrpc_free_skb(part);
+ rxrpc_free_skb(jumbo);
+ write_lock_bh(&call->state_lock);
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->local_abort = RX_PROTOCOL_ERROR;
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+ write_unlock_bh(&call->state_lock);
+ _leave("");
+}
+
+/*
+ * post an incoming packet to the appropriate call/socket to deal with
+ * - must get rid of the sk_buff, either by freeing it or by queuing it
+ */
+static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp;
+
+ _enter("%p,%p", call, skb);
+
+ sp = rxrpc_skb(skb);
+
+ _debug("extant call [%d]", call->state);
+
+ read_lock(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
+ rxrpc_queue_call(call);
+ goto free_unlock;
+ }
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ case RXRPC_CALL_NETWORK_ERROR:
+ case RXRPC_CALL_DEAD:
+ goto dead_call;
+ case RXRPC_CALL_COMPLETE:
+ case RXRPC_CALL_CLIENT_FINAL_ACK:
+ /* complete server call */
+ if (rxrpc_conn_is_service(call->conn))
+ goto dead_call;
+ /* resend last packet of a completed call */
+ _debug("final ack again");
+ rxrpc_get_call(call);
+ set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
+ rxrpc_queue_call(call);
+ goto free_unlock;
+ default:
+ break;
+ }
+
+ read_unlock(&call->state_lock);
+ rxrpc_get_call(call);
+
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
+ sp->hdr.flags & RXRPC_JUMBO_PACKET)
+ rxrpc_process_jumbo_packet(call, skb);
+ else
+ rxrpc_fast_process_packet(call, skb);
+
+ rxrpc_put_call(call);
+ goto done;
+
+dead_call:
+ if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
+ skb->priority = RX_CALL_DEAD;
+ rxrpc_reject_packet(call->conn->params.local, skb);
+ goto unlock;
+ }
+free_unlock:
+ rxrpc_free_skb(skb);
+unlock:
+ read_unlock(&call->state_lock);
+done:
+ _leave("");
+}
+
+/*
+ * post connection-level events to the connection
+ * - this includes challenges, responses and some aborts
+ */
+static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ _enter("%p,%p", conn, skb);
+
+ skb_queue_tail(&conn->rx_queue, skb);
+ rxrpc_queue_conn(conn);
+}
+
+/*
+ * post endpoint-level events to the local endpoint
+ * - this includes debug and version messages
+ */
+static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
+ struct sk_buff *skb)
+{
+ _enter("%p,%p", local, skb);
+
+ skb_queue_tail(&local->event_queue, skb);
+ rxrpc_queue_local(local);
+}
+
+/*
+ * Extract the wire header from a packet and translate the byte order.
+ */
+static noinline
+int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
+{
+ struct rxrpc_wire_header whdr;
+
+ /* dig out the RxRPC connection details */
+ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0)
+ return -EBADMSG;
+ if (!pskb_pull(skb, sizeof(whdr)))
+ BUG();
+
+ memset(sp, 0, sizeof(*sp));
+ sp->hdr.epoch = ntohl(whdr.epoch);
+ sp->hdr.cid = ntohl(whdr.cid);
+ sp->hdr.callNumber = ntohl(whdr.callNumber);
+ sp->hdr.seq = ntohl(whdr.seq);
+ sp->hdr.serial = ntohl(whdr.serial);
+ sp->hdr.flags = whdr.flags;
+ sp->hdr.type = whdr.type;
+ sp->hdr.userStatus = whdr.userStatus;
+ sp->hdr.securityIndex = whdr.securityIndex;
+ sp->hdr._rsvd = ntohs(whdr._rsvd);
+ sp->hdr.serviceId = ntohs(whdr.serviceId);
+ return 0;
+}
+
+/*
+ * handle data received on the local endpoint
+ * - may be called in interrupt context
+ *
+ * The socket is locked by the caller and this prevents the socket from being
+ * shut down and the local endpoint from going away, thus sk_user_data will not
+ * be cleared until this function returns.
+ */
+void rxrpc_data_ready(struct sock *sk)
+{
+ struct rxrpc_connection *conn;
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_local *local = sk->sk_user_data;
+ struct sk_buff *skb;
+ int ret;
+
+ _enter("%p", sk);
+
+ ASSERT(!irqs_disabled());
+
+ skb = skb_recv_datagram(sk, 0, 1, &ret);
+ if (!skb) {
+ if (ret == -EAGAIN)
+ return;
+ _debug("UDP socket error %d", ret);
+ return;
+ }
+
+ rxrpc_new_skb(skb);
+
+ _net("recv skb %p", skb);
+
+ /* we'll probably need to checksum it (didn't call sock_recvmsg) */
+ if (skb_checksum_complete(skb)) {
+ rxrpc_free_skb(skb);
+ __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0);
+ _leave(" [CSUM failed]");
+ return;
+ }
+
+ __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0);
+
+ /* The socket buffer we have is owned by UDP, with UDP's data all over
+ * it, but we really want our own data there.
+ */
+ skb_orphan(skb);
+ sp = rxrpc_skb(skb);
+
+ _net("Rx UDP packet from %08x:%04hu",
+ ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
+
+ /* dig out the RxRPC connection details */
+ if (rxrpc_extract_header(sp, skb) < 0)
+ goto bad_message;
+
+ _net("Rx RxRPC %s ep=%x call=%x:%x",
+ sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
+ sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
+
+ if (sp->hdr.type >= RXRPC_N_PACKET_TYPES ||
+ !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) {
+ _proto("Rx Bad Packet Type %u", sp->hdr.type);
+ goto bad_message;
+ }
+
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) {
+ rxrpc_post_packet_to_local(local, skb);
+ goto out;
+ }
+
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
+ (sp->hdr.callNumber == 0 || sp->hdr.seq == 0))
+ goto bad_message;
+
+ rcu_read_lock();
+
+ conn = rxrpc_find_connection_rcu(local, skb);
+ if (!conn)
+ goto cant_route_call;
+
+ if (sp->hdr.callNumber == 0) {
+ /* Connection-level packet */
+ _debug("CONN %p {%d}", conn, conn->debug_id);
+ rxrpc_post_packet_to_conn(conn, skb);
+ } else {
+ /* Call-bound packets are routed by connection channel. */
+ unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK;
+ struct rxrpc_channel *chan = &conn->channels[channel];
+ struct rxrpc_call *call = rcu_dereference(chan->call);
+
+ if (!call || atomic_read(&call->usage) == 0)
+ goto cant_route_call;
+
+ rxrpc_post_packet_to_call(call, skb);
+ }
+
+ rcu_read_unlock();
+out:
+ return;
+
+cant_route_call:
+ rcu_read_unlock();
+
+ _debug("can't route call");
+ if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
+ sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
+ if (sp->hdr.seq == 1) {
+ _debug("first packet");
+ skb_queue_tail(&local->accept_queue, skb);
+ rxrpc_queue_work(&local->processor);
+ _leave(" [incoming]");
+ return;
+ }
+ skb->priority = RX_INVALID_OPERATION;
+ } else {
+ skb->priority = RX_CALL_DEAD;
+ }
+
+ if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
+ _debug("reject type %d",sp->hdr.type);
+ rxrpc_reject_packet(local, skb);
+ } else {
+ rxrpc_free_skb(skb);
+ }
+ _leave(" [no call]");
+ return;
+
+bad_message:
+ skb->priority = RX_PROTOCOL_ERROR;
+ rxrpc_reject_packet(local, skb);
+ _leave(" [badmsg]");
+}
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index e57140361..c21ad213b 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -17,11 +17,12 @@ static int none_init_connection_security(struct rxrpc_connection *conn)
return 0;
}
-static void none_prime_packet_security(struct rxrpc_connection *conn)
+static int none_prime_packet_security(struct rxrpc_connection *conn)
{
+ return 0;
}
-static int none_secure_packet(const struct rxrpc_call *call,
+static int none_secure_packet(struct rxrpc_call *call,
struct sk_buff *skb,
size_t data_size,
void *sechdr)
@@ -29,7 +30,7 @@ static int none_secure_packet(const struct rxrpc_call *call,
return 0;
}
-static int none_verify_packet(const struct rxrpc_call *call,
+static int none_verify_packet(struct rxrpc_call *call,
struct sk_buff *skb,
u32 *_abort_code)
{
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
new file mode 100644
index 000000000..18c737a61
--- /dev/null
+++ b/net/rxrpc/key.c
@@ -0,0 +1,1237 @@
+/* RxRPC key management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * RxRPC keys should have a description of describing their purpose:
+ * "afs@CAMBRIDGE.REDHAT.COM>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/skcipher.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/key-type.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
+#include <keys/user-type.h>
+#include "ar-internal.h"
+
+static int rxrpc_vet_description_s(const char *);
+static int rxrpc_preparse(struct key_preparsed_payload *);
+static int rxrpc_preparse_s(struct key_preparsed_payload *);
+static void rxrpc_free_preparse(struct key_preparsed_payload *);
+static void rxrpc_free_preparse_s(struct key_preparsed_payload *);
+static void rxrpc_destroy(struct key *);
+static void rxrpc_destroy_s(struct key *);
+static void rxrpc_describe(const struct key *, struct seq_file *);
+static long rxrpc_read(const struct key *, char __user *, size_t);
+
+/*
+ * rxrpc defined keys take an arbitrary string as the description and an
+ * arbitrary blob of data as the payload
+ */
+struct key_type key_type_rxrpc = {
+ .name = "rxrpc",
+ .preparse = rxrpc_preparse,
+ .free_preparse = rxrpc_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .destroy = rxrpc_destroy,
+ .describe = rxrpc_describe,
+ .read = rxrpc_read,
+};
+EXPORT_SYMBOL(key_type_rxrpc);
+
+/*
+ * rxrpc server defined keys take "<serviceId>:<securityIndex>" as the
+ * description and an 8-byte decryption key as the payload
+ */
+struct key_type key_type_rxrpc_s = {
+ .name = "rxrpc_s",
+ .vet_description = rxrpc_vet_description_s,
+ .preparse = rxrpc_preparse_s,
+ .free_preparse = rxrpc_free_preparse_s,
+ .instantiate = generic_key_instantiate,
+ .destroy = rxrpc_destroy_s,
+ .describe = rxrpc_describe,
+};
+
+/*
+ * Vet the description for an RxRPC server key
+ */
+static int rxrpc_vet_description_s(const char *desc)
+{
+ unsigned long num;
+ char *p;
+
+ num = simple_strtoul(desc, &p, 10);
+ if (*p != ':' || num > 65535)
+ return -EINVAL;
+ num = simple_strtoul(p + 1, &p, 10);
+ if (*p || num < 1 || num > 255)
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * parse an RxKAD type XDR format token
+ * - the caller guarantees we have at least 4 words
+ */
+static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep,
+ size_t datalen,
+ const __be32 *xdr, unsigned int toklen)
+{
+ struct rxrpc_key_token *token, **pptoken;
+ size_t plen;
+ u32 tktlen;
+
+ _enter(",{%x,%x,%x,%x},%u",
+ ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+ toklen);
+
+ if (toklen <= 8 * 4)
+ return -EKEYREJECTED;
+ tktlen = ntohl(xdr[7]);
+ _debug("tktlen: %x", tktlen);
+ if (tktlen > AFSTOKEN_RK_TIX_MAX)
+ return -EKEYREJECTED;
+ if (toklen < 8 * 4 + tktlen)
+ return -EKEYREJECTED;
+
+ plen = sizeof(*token) + sizeof(*token->kad) + tktlen;
+ prep->quotalen = datalen + plen;
+
+ plen -= sizeof(*token);
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
+ if (!token)
+ return -ENOMEM;
+
+ token->kad = kzalloc(plen, GFP_KERNEL);
+ if (!token->kad) {
+ kfree(token);
+ return -ENOMEM;
+ }
+
+ token->security_index = RXRPC_SECURITY_RXKAD;
+ token->kad->ticket_len = tktlen;
+ token->kad->vice_id = ntohl(xdr[0]);
+ token->kad->kvno = ntohl(xdr[1]);
+ token->kad->start = ntohl(xdr[4]);
+ token->kad->expiry = ntohl(xdr[5]);
+ token->kad->primary_flag = ntohl(xdr[6]);
+ memcpy(&token->kad->session_key, &xdr[2], 8);
+ memcpy(&token->kad->ticket, &xdr[8], tktlen);
+
+ _debug("SCIX: %u", token->security_index);
+ _debug("TLEN: %u", token->kad->ticket_len);
+ _debug("EXPY: %x", token->kad->expiry);
+ _debug("KVNO: %u", token->kad->kvno);
+ _debug("PRIM: %u", token->kad->primary_flag);
+ _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x",
+ token->kad->session_key[0], token->kad->session_key[1],
+ token->kad->session_key[2], token->kad->session_key[3],
+ token->kad->session_key[4], token->kad->session_key[5],
+ token->kad->session_key[6], token->kad->session_key[7]);
+ if (token->kad->ticket_len >= 8)
+ _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x",
+ token->kad->ticket[0], token->kad->ticket[1],
+ token->kad->ticket[2], token->kad->ticket[3],
+ token->kad->ticket[4], token->kad->ticket[5],
+ token->kad->ticket[6], token->kad->ticket[7]);
+
+ /* count the number of tokens attached */
+ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1);
+
+ /* attach the data */
+ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0];
+ *pptoken;
+ pptoken = &(*pptoken)->next)
+ continue;
+ *pptoken = token;
+ if (token->kad->expiry < prep->expiry)
+ prep->expiry = token->kad->expiry;
+
+ _leave(" = 0");
+ return 0;
+}
+
+static void rxrpc_free_krb5_principal(struct krb5_principal *princ)
+{
+ int loop;
+
+ if (princ->name_parts) {
+ for (loop = princ->n_name_parts - 1; loop >= 0; loop--)
+ kfree(princ->name_parts[loop]);
+ kfree(princ->name_parts);
+ }
+ kfree(princ->realm);
+}
+
+static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td)
+{
+ kfree(td->data);
+}
+
+/*
+ * free up an RxK5 token
+ */
+static void rxrpc_rxk5_free(struct rxk5_key *rxk5)
+{
+ int loop;
+
+ rxrpc_free_krb5_principal(&rxk5->client);
+ rxrpc_free_krb5_principal(&rxk5->server);
+ rxrpc_free_krb5_tagged(&rxk5->session);
+
+ if (rxk5->addresses) {
+ for (loop = rxk5->n_addresses - 1; loop >= 0; loop--)
+ rxrpc_free_krb5_tagged(&rxk5->addresses[loop]);
+ kfree(rxk5->addresses);
+ }
+ if (rxk5->authdata) {
+ for (loop = rxk5->n_authdata - 1; loop >= 0; loop--)
+ rxrpc_free_krb5_tagged(&rxk5->authdata[loop]);
+ kfree(rxk5->authdata);
+ }
+
+ kfree(rxk5->ticket);
+ kfree(rxk5->ticket2);
+ kfree(rxk5);
+}
+
+/*
+ * extract a krb5 principal
+ */
+static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
+ const __be32 **_xdr,
+ unsigned int *_toklen)
+{
+ const __be32 *xdr = *_xdr;
+ unsigned int toklen = *_toklen, n_parts, loop, tmp;
+
+ /* there must be at least one name, and at least #names+1 length
+ * words */
+ if (toklen <= 12)
+ return -EINVAL;
+
+ _enter(",{%x,%x,%x},%u",
+ ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen);
+
+ n_parts = ntohl(*xdr++);
+ toklen -= 4;
+ if (n_parts <= 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX)
+ return -EINVAL;
+ princ->n_name_parts = n_parts;
+
+ if (toklen <= (n_parts + 1) * 4)
+ return -EINVAL;
+
+ princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL);
+ if (!princ->name_parts)
+ return -ENOMEM;
+
+ for (loop = 0; loop < n_parts; loop++) {
+ if (toklen < 4)
+ return -EINVAL;
+ tmp = ntohl(*xdr++);
+ toklen -= 4;
+ if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX)
+ return -EINVAL;
+ if (tmp > toklen)
+ return -EINVAL;
+ princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL);
+ if (!princ->name_parts[loop])
+ return -ENOMEM;
+ memcpy(princ->name_parts[loop], xdr, tmp);
+ princ->name_parts[loop][tmp] = 0;
+ tmp = (tmp + 3) & ~3;
+ toklen -= tmp;
+ xdr += tmp >> 2;
+ }
+
+ if (toklen < 4)
+ return -EINVAL;
+ tmp = ntohl(*xdr++);
+ toklen -= 4;
+ if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX)
+ return -EINVAL;
+ if (tmp > toklen)
+ return -EINVAL;
+ princ->realm = kmalloc(tmp + 1, GFP_KERNEL);
+ if (!princ->realm)
+ return -ENOMEM;
+ memcpy(princ->realm, xdr, tmp);
+ princ->realm[tmp] = 0;
+ tmp = (tmp + 3) & ~3;
+ toklen -= tmp;
+ xdr += tmp >> 2;
+
+ _debug("%s/...@%s", princ->name_parts[0], princ->realm);
+
+ *_xdr = xdr;
+ *_toklen = toklen;
+ _leave(" = 0 [toklen=%u]", toklen);
+ return 0;
+}
+
+/*
+ * extract a piece of krb5 tagged data
+ */
+static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td,
+ size_t max_data_size,
+ const __be32 **_xdr,
+ unsigned int *_toklen)
+{
+ const __be32 *xdr = *_xdr;
+ unsigned int toklen = *_toklen, len;
+
+ /* there must be at least one tag and one length word */
+ if (toklen <= 8)
+ return -EINVAL;
+
+ _enter(",%zu,{%x,%x},%u",
+ max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen);
+
+ td->tag = ntohl(*xdr++);
+ len = ntohl(*xdr++);
+ toklen -= 8;
+ if (len > max_data_size)
+ return -EINVAL;
+ td->data_len = len;
+
+ if (len > 0) {
+ td->data = kmemdup(xdr, len, GFP_KERNEL);
+ if (!td->data)
+ return -ENOMEM;
+ len = (len + 3) & ~3;
+ toklen -= len;
+ xdr += len >> 2;
+ }
+
+ _debug("tag %x len %x", td->tag, td->data_len);
+
+ *_xdr = xdr;
+ *_toklen = toklen;
+ _leave(" = 0 [toklen=%u]", toklen);
+ return 0;
+}
+
+/*
+ * extract an array of tagged data
+ */
+static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td,
+ u8 *_n_elem,
+ u8 max_n_elem,
+ size_t max_elem_size,
+ const __be32 **_xdr,
+ unsigned int *_toklen)
+{
+ struct krb5_tagged_data *td;
+ const __be32 *xdr = *_xdr;
+ unsigned int toklen = *_toklen, n_elem, loop;
+ int ret;
+
+ /* there must be at least one count */
+ if (toklen < 4)
+ return -EINVAL;
+
+ _enter(",,%u,%zu,{%x},%u",
+ max_n_elem, max_elem_size, ntohl(xdr[0]), toklen);
+
+ n_elem = ntohl(*xdr++);
+ toklen -= 4;
+ if (n_elem > max_n_elem)
+ return -EINVAL;
+ *_n_elem = n_elem;
+ if (n_elem > 0) {
+ if (toklen <= (n_elem + 1) * 4)
+ return -EINVAL;
+
+ _debug("n_elem %d", n_elem);
+
+ td = kcalloc(n_elem, sizeof(struct krb5_tagged_data),
+ GFP_KERNEL);
+ if (!td)
+ return -ENOMEM;
+ *_td = td;
+
+ for (loop = 0; loop < n_elem; loop++) {
+ ret = rxrpc_krb5_decode_tagged_data(&td[loop],
+ max_elem_size,
+ &xdr, &toklen);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ *_xdr = xdr;
+ *_toklen = toklen;
+ _leave(" = 0 [toklen=%u]", toklen);
+ return 0;
+}
+
+/*
+ * extract a krb5 ticket
+ */
+static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
+ const __be32 **_xdr, unsigned int *_toklen)
+{
+ const __be32 *xdr = *_xdr;
+ unsigned int toklen = *_toklen, len;
+
+ /* there must be at least one length word */
+ if (toklen <= 4)
+ return -EINVAL;
+
+ _enter(",{%x},%u", ntohl(xdr[0]), toklen);
+
+ len = ntohl(*xdr++);
+ toklen -= 4;
+ if (len > AFSTOKEN_K5_TIX_MAX)
+ return -EINVAL;
+ *_tktlen = len;
+
+ _debug("ticket len %u", len);
+
+ if (len > 0) {
+ *_ticket = kmemdup(xdr, len, GFP_KERNEL);
+ if (!*_ticket)
+ return -ENOMEM;
+ len = (len + 3) & ~3;
+ toklen -= len;
+ xdr += len >> 2;
+ }
+
+ *_xdr = xdr;
+ *_toklen = toklen;
+ _leave(" = 0 [toklen=%u]", toklen);
+ return 0;
+}
+
+/*
+ * parse an RxK5 type XDR format token
+ * - the caller guarantees we have at least 4 words
+ */
+static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep,
+ size_t datalen,
+ const __be32 *xdr, unsigned int toklen)
+{
+ struct rxrpc_key_token *token, **pptoken;
+ struct rxk5_key *rxk5;
+ const __be32 *end_xdr = xdr + (toklen >> 2);
+ int ret;
+
+ _enter(",{%x,%x,%x,%x},%u",
+ ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+ toklen);
+
+ /* reserve some payload space for this subkey - the length of the token
+ * is a reasonable approximation */
+ prep->quotalen = datalen + toklen;
+
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
+ if (!token)
+ return -ENOMEM;
+
+ rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL);
+ if (!rxk5) {
+ kfree(token);
+ return -ENOMEM;
+ }
+
+ token->security_index = RXRPC_SECURITY_RXK5;
+ token->k5 = rxk5;
+
+ /* extract the principals */
+ ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+ ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ /* extract the session key and the encoding type (the tag field ->
+ * ENCTYPE_xxx) */
+ ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ if (toklen < 4 * 8 + 2 * 4)
+ goto inval;
+ rxk5->authtime = be64_to_cpup((const __be64 *) xdr);
+ xdr += 2;
+ rxk5->starttime = be64_to_cpup((const __be64 *) xdr);
+ xdr += 2;
+ rxk5->endtime = be64_to_cpup((const __be64 *) xdr);
+ xdr += 2;
+ rxk5->renew_till = be64_to_cpup((const __be64 *) xdr);
+ xdr += 2;
+ rxk5->is_skey = ntohl(*xdr++);
+ rxk5->flags = ntohl(*xdr++);
+ toklen -= 4 * 8 + 2 * 4;
+
+ _debug("times: a=%llx s=%llx e=%llx rt=%llx",
+ rxk5->authtime, rxk5->starttime, rxk5->endtime,
+ rxk5->renew_till);
+ _debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags);
+
+ /* extract the permitted client addresses */
+ ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses,
+ &rxk5->n_addresses,
+ AFSTOKEN_K5_ADDRESSES_MAX,
+ AFSTOKEN_DATA_MAX,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
+
+ /* extract the tickets */
+ ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+ ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
+
+ /* extract the typed auth data */
+ ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata,
+ &rxk5->n_authdata,
+ AFSTOKEN_K5_AUTHDATA_MAX,
+ AFSTOKEN_BDATALN_MAX,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
+
+ if (toklen != 0)
+ goto inval;
+
+ /* attach the payload */
+ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0];
+ *pptoken;
+ pptoken = &(*pptoken)->next)
+ continue;
+ *pptoken = token;
+ if (token->kad->expiry < prep->expiry)
+ prep->expiry = token->kad->expiry;
+
+ _leave(" = 0");
+ return 0;
+
+inval:
+ ret = -EINVAL;
+error:
+ rxrpc_rxk5_free(rxk5);
+ kfree(token);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * attempt to parse the data as the XDR format
+ * - the caller guarantees we have more than 7 words
+ */
+static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
+{
+ const __be32 *xdr = prep->data, *token;
+ const char *cp;
+ unsigned int len, tmp, loop, ntoken, toklen, sec_ix;
+ size_t datalen = prep->datalen;
+ int ret;
+
+ _enter(",{%x,%x,%x,%x},%zu",
+ ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+ prep->datalen);
+
+ if (datalen > AFSTOKEN_LENGTH_MAX)
+ goto not_xdr;
+
+ /* XDR is an array of __be32's */
+ if (datalen & 3)
+ goto not_xdr;
+
+ /* the flags should be 0 (the setpag bit must be handled by
+ * userspace) */
+ if (ntohl(*xdr++) != 0)
+ goto not_xdr;
+ datalen -= 4;
+
+ /* check the cell name */
+ len = ntohl(*xdr++);
+ if (len < 1 || len > AFSTOKEN_CELL_MAX)
+ goto not_xdr;
+ datalen -= 4;
+ tmp = (len + 3) & ~3;
+ if (tmp > datalen)
+ goto not_xdr;
+
+ cp = (const char *) xdr;
+ for (loop = 0; loop < len; loop++)
+ if (!isprint(cp[loop]))
+ goto not_xdr;
+ if (len < tmp)
+ for (; loop < tmp; loop++)
+ if (cp[loop])
+ goto not_xdr;
+ _debug("cellname: [%u/%u] '%*.*s'",
+ len, tmp, len, len, (const char *) xdr);
+ datalen -= tmp;
+ xdr += tmp >> 2;
+
+ /* get the token count */
+ if (datalen < 12)
+ goto not_xdr;
+ ntoken = ntohl(*xdr++);
+ datalen -= 4;
+ _debug("ntoken: %x", ntoken);
+ if (ntoken < 1 || ntoken > AFSTOKEN_MAX)
+ goto not_xdr;
+
+ /* check each token wrapper */
+ token = xdr;
+ loop = ntoken;
+ do {
+ if (datalen < 8)
+ goto not_xdr;
+ toklen = ntohl(*xdr++);
+ sec_ix = ntohl(*xdr);
+ datalen -= 4;
+ _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix);
+ if (toklen < 20 || toklen > datalen)
+ goto not_xdr;
+ datalen -= (toklen + 3) & ~3;
+ xdr += (toklen + 3) >> 2;
+
+ } while (--loop > 0);
+
+ _debug("remainder: %zu", datalen);
+ if (datalen != 0)
+ goto not_xdr;
+
+ /* okay: we're going to assume it's valid XDR format
+ * - we ignore the cellname, relying on the key to be correctly named
+ */
+ do {
+ xdr = token;
+ toklen = ntohl(*xdr++);
+ token = xdr + ((toklen + 3) >> 2);
+ sec_ix = ntohl(*xdr++);
+ toklen -= 4;
+
+ _debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token);
+
+ switch (sec_ix) {
+ case RXRPC_SECURITY_RXKAD:
+ ret = rxrpc_preparse_xdr_rxkad(prep, datalen, xdr, toklen);
+ if (ret != 0)
+ goto error;
+ break;
+
+ case RXRPC_SECURITY_RXK5:
+ ret = rxrpc_preparse_xdr_rxk5(prep, datalen, xdr, toklen);
+ if (ret != 0)
+ goto error;
+ break;
+
+ default:
+ ret = -EPROTONOSUPPORT;
+ goto error;
+ }
+
+ } while (--ntoken > 0);
+
+ _leave(" = 0");
+ return 0;
+
+not_xdr:
+ _leave(" = -EPROTO");
+ return -EPROTO;
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Preparse an rxrpc defined key.
+ *
+ * Data should be of the form:
+ * OFFSET LEN CONTENT
+ * 0 4 key interface version number
+ * 4 2 security index (type)
+ * 6 2 ticket length
+ * 8 4 key expiry time (time_t)
+ * 12 4 kvno
+ * 16 8 session key
+ * 24 [len] ticket
+ *
+ * if no data is provided, then a no-security key is made
+ */
+static int rxrpc_preparse(struct key_preparsed_payload *prep)
+{
+ const struct rxrpc_key_data_v1 *v1;
+ struct rxrpc_key_token *token, **pp;
+ size_t plen;
+ u32 kver;
+ int ret;
+
+ _enter("%zu", prep->datalen);
+
+ /* handle a no-security key */
+ if (!prep->data && prep->datalen == 0)
+ return 0;
+
+ /* determine if the XDR payload format is being used */
+ if (prep->datalen > 7 * 4) {
+ ret = rxrpc_preparse_xdr(prep);
+ if (ret != -EPROTO)
+ return ret;
+ }
+
+ /* get the key interface version number */
+ ret = -EINVAL;
+ if (prep->datalen <= 4 || !prep->data)
+ goto error;
+ memcpy(&kver, prep->data, sizeof(kver));
+ prep->data += sizeof(kver);
+ prep->datalen -= sizeof(kver);
+
+ _debug("KEY I/F VERSION: %u", kver);
+
+ ret = -EKEYREJECTED;
+ if (kver != 1)
+ goto error;
+
+ /* deal with a version 1 key */
+ ret = -EINVAL;
+ if (prep->datalen < sizeof(*v1))
+ goto error;
+
+ v1 = prep->data;
+ if (prep->datalen != sizeof(*v1) + v1->ticket_length)
+ goto error;
+
+ _debug("SCIX: %u", v1->security_index);
+ _debug("TLEN: %u", v1->ticket_length);
+ _debug("EXPY: %x", v1->expiry);
+ _debug("KVNO: %u", v1->kvno);
+ _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x",
+ v1->session_key[0], v1->session_key[1],
+ v1->session_key[2], v1->session_key[3],
+ v1->session_key[4], v1->session_key[5],
+ v1->session_key[6], v1->session_key[7]);
+ if (v1->ticket_length >= 8)
+ _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x",
+ v1->ticket[0], v1->ticket[1],
+ v1->ticket[2], v1->ticket[3],
+ v1->ticket[4], v1->ticket[5],
+ v1->ticket[6], v1->ticket[7]);
+
+ ret = -EPROTONOSUPPORT;
+ if (v1->security_index != RXRPC_SECURITY_RXKAD)
+ goto error;
+
+ plen = sizeof(*token->kad) + v1->ticket_length;
+ prep->quotalen = plen + sizeof(*token);
+
+ ret = -ENOMEM;
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
+ if (!token)
+ goto error;
+ token->kad = kzalloc(plen, GFP_KERNEL);
+ if (!token->kad)
+ goto error_free;
+
+ token->security_index = RXRPC_SECURITY_RXKAD;
+ token->kad->ticket_len = v1->ticket_length;
+ token->kad->expiry = v1->expiry;
+ token->kad->kvno = v1->kvno;
+ memcpy(&token->kad->session_key, &v1->session_key, 8);
+ memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length);
+
+ /* count the number of tokens attached */
+ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1);
+
+ /* attach the data */
+ pp = (struct rxrpc_key_token **)&prep->payload.data[0];
+ while (*pp)
+ pp = &(*pp)->next;
+ *pp = token;
+ if (token->kad->expiry < prep->expiry)
+ prep->expiry = token->kad->expiry;
+ token = NULL;
+ ret = 0;
+
+error_free:
+ kfree(token);
+error:
+ return ret;
+}
+
+/*
+ * Free token list.
+ */
+static void rxrpc_free_token_list(struct rxrpc_key_token *token)
+{
+ struct rxrpc_key_token *next;
+
+ for (; token; token = next) {
+ next = token->next;
+ switch (token->security_index) {
+ case RXRPC_SECURITY_RXKAD:
+ kfree(token->kad);
+ break;
+ case RXRPC_SECURITY_RXK5:
+ if (token->k5)
+ rxrpc_rxk5_free(token->k5);
+ break;
+ default:
+ pr_err("Unknown token type %x on rxrpc key\n",
+ token->security_index);
+ BUG();
+ }
+
+ kfree(token);
+ }
+}
+
+/*
+ * Clean up preparse data.
+ */
+static void rxrpc_free_preparse(struct key_preparsed_payload *prep)
+{
+ rxrpc_free_token_list(prep->payload.data[0]);
+}
+
+/*
+ * Preparse a server secret key.
+ *
+ * The data should be the 8-byte secret key.
+ */
+static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
+{
+ struct crypto_skcipher *ci;
+
+ _enter("%zu", prep->datalen);
+
+ if (prep->datalen != 8)
+ return -EINVAL;
+
+ memcpy(&prep->payload.data[2], prep->data, 8);
+
+ ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(ci)) {
+ _leave(" = %ld", PTR_ERR(ci));
+ return PTR_ERR(ci);
+ }
+
+ if (crypto_skcipher_setkey(ci, prep->data, 8) < 0)
+ BUG();
+
+ prep->payload.data[0] = ci;
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Clean up preparse data.
+ */
+static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep)
+{
+ if (prep->payload.data[0])
+ crypto_free_skcipher(prep->payload.data[0]);
+}
+
+/*
+ * dispose of the data dangling from the corpse of a rxrpc key
+ */
+static void rxrpc_destroy(struct key *key)
+{
+ rxrpc_free_token_list(key->payload.data[0]);
+}
+
+/*
+ * dispose of the data dangling from the corpse of a rxrpc key
+ */
+static void rxrpc_destroy_s(struct key *key)
+{
+ if (key->payload.data[0]) {
+ crypto_free_skcipher(key->payload.data[0]);
+ key->payload.data[0] = NULL;
+ }
+}
+
+/*
+ * describe the rxrpc key
+ */
+static void rxrpc_describe(const struct key *key, struct seq_file *m)
+{
+ seq_puts(m, key->description);
+}
+
+/*
+ * grab the security key for a socket
+ */
+int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen)
+{
+ struct key *key;
+ char *description;
+
+ _enter("");
+
+ if (optlen <= 0 || optlen > PAGE_SIZE - 1)
+ return -EINVAL;
+
+ description = memdup_user_nul(optval, optlen);
+ if (IS_ERR(description))
+ return PTR_ERR(description);
+
+ key = request_key(&key_type_rxrpc, description, NULL);
+ if (IS_ERR(key)) {
+ kfree(description);
+ _leave(" = %ld", PTR_ERR(key));
+ return PTR_ERR(key);
+ }
+
+ rx->key = key;
+ kfree(description);
+ _leave(" = 0 [key %x]", key->serial);
+ return 0;
+}
+
+/*
+ * grab the security keyring for a server socket
+ */
+int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval,
+ int optlen)
+{
+ struct key *key;
+ char *description;
+
+ _enter("");
+
+ if (optlen <= 0 || optlen > PAGE_SIZE - 1)
+ return -EINVAL;
+
+ description = memdup_user_nul(optval, optlen);
+ if (IS_ERR(description))
+ return PTR_ERR(description);
+
+ key = request_key(&key_type_keyring, description, NULL);
+ if (IS_ERR(key)) {
+ kfree(description);
+ _leave(" = %ld", PTR_ERR(key));
+ return PTR_ERR(key);
+ }
+
+ rx->securities = key;
+ kfree(description);
+ _leave(" = 0 [key %x]", key->serial);
+ return 0;
+}
+
+/*
+ * generate a server data key
+ */
+int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
+ const void *session_key,
+ time_t expiry,
+ u32 kvno)
+{
+ const struct cred *cred = current_cred();
+ struct key *key;
+ int ret;
+
+ struct {
+ u32 kver;
+ struct rxrpc_key_data_v1 v1;
+ } data;
+
+ _enter("");
+
+ key = key_alloc(&key_type_rxrpc, "x",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ if (IS_ERR(key)) {
+ _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
+ return -ENOMEM;
+ }
+
+ _debug("key %d", key_serial(key));
+
+ data.kver = 1;
+ data.v1.security_index = RXRPC_SECURITY_RXKAD;
+ data.v1.ticket_length = 0;
+ data.v1.expiry = expiry;
+ data.v1.kvno = 0;
+
+ memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key));
+
+ ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL);
+ if (ret < 0)
+ goto error;
+
+ conn->params.key = key;
+ _leave(" = 0 [%d]", key_serial(key));
+ return 0;
+
+error:
+ key_revoke(key);
+ key_put(key);
+ _leave(" = -ENOMEM [ins %d]", ret);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(rxrpc_get_server_data_key);
+
+/**
+ * rxrpc_get_null_key - Generate a null RxRPC key
+ * @keyname: The name to give the key.
+ *
+ * Generate a null RxRPC key that can be used to indicate anonymous security is
+ * required for a particular domain.
+ */
+struct key *rxrpc_get_null_key(const char *keyname)
+{
+ const struct cred *cred = current_cred();
+ struct key *key;
+ int ret;
+
+ key = key_alloc(&key_type_rxrpc, keyname,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+ KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ if (IS_ERR(key))
+ return key;
+
+ ret = key_instantiate_and_link(key, NULL, 0, NULL, NULL);
+ if (ret < 0) {
+ key_revoke(key);
+ key_put(key);
+ return ERR_PTR(ret);
+ }
+
+ return key;
+}
+EXPORT_SYMBOL(rxrpc_get_null_key);
+
+/*
+ * read the contents of an rxrpc key
+ * - this returns the result in XDR form
+ */
+static long rxrpc_read(const struct key *key,
+ char __user *buffer, size_t buflen)
+{
+ const struct rxrpc_key_token *token;
+ const struct krb5_principal *princ;
+ size_t size;
+ __be32 __user *xdr, *oldxdr;
+ u32 cnlen, toksize, ntoks, tok, zero;
+ u16 toksizes[AFSTOKEN_MAX];
+ int loop;
+
+ _enter("");
+
+ /* we don't know what form we should return non-AFS keys in */
+ if (memcmp(key->description, "afs@", 4) != 0)
+ return -EOPNOTSUPP;
+ cnlen = strlen(key->description + 4);
+
+#define RND(X) (((X) + 3) & ~3)
+
+ /* AFS keys we return in XDR form, so we need to work out the size of
+ * the XDR */
+ size = 2 * 4; /* flags, cellname len */
+ size += RND(cnlen); /* cellname */
+ size += 1 * 4; /* token count */
+
+ ntoks = 0;
+ for (token = key->payload.data[0]; token; token = token->next) {
+ toksize = 4; /* sec index */
+
+ switch (token->security_index) {
+ case RXRPC_SECURITY_RXKAD:
+ toksize += 8 * 4; /* viceid, kvno, key*2, begin,
+ * end, primary, tktlen */
+ toksize += RND(token->kad->ticket_len);
+ break;
+
+ case RXRPC_SECURITY_RXK5:
+ princ = &token->k5->client;
+ toksize += 4 + princ->n_name_parts * 4;
+ for (loop = 0; loop < princ->n_name_parts; loop++)
+ toksize += RND(strlen(princ->name_parts[loop]));
+ toksize += 4 + RND(strlen(princ->realm));
+
+ princ = &token->k5->server;
+ toksize += 4 + princ->n_name_parts * 4;
+ for (loop = 0; loop < princ->n_name_parts; loop++)
+ toksize += RND(strlen(princ->name_parts[loop]));
+ toksize += 4 + RND(strlen(princ->realm));
+
+ toksize += 8 + RND(token->k5->session.data_len);
+
+ toksize += 4 * 8 + 2 * 4;
+
+ toksize += 4 + token->k5->n_addresses * 8;
+ for (loop = 0; loop < token->k5->n_addresses; loop++)
+ toksize += RND(token->k5->addresses[loop].data_len);
+
+ toksize += 4 + RND(token->k5->ticket_len);
+ toksize += 4 + RND(token->k5->ticket2_len);
+
+ toksize += 4 + token->k5->n_authdata * 8;
+ for (loop = 0; loop < token->k5->n_authdata; loop++)
+ toksize += RND(token->k5->authdata[loop].data_len);
+ break;
+
+ default: /* we have a ticket we can't encode */
+ BUG();
+ continue;
+ }
+
+ _debug("token[%u]: toksize=%u", ntoks, toksize);
+ ASSERTCMP(toksize, <=, AFSTOKEN_LENGTH_MAX);
+
+ toksizes[ntoks++] = toksize;
+ size += toksize + 4; /* each token has a length word */
+ }
+
+#undef RND
+
+ if (!buffer || buflen < size)
+ return size;
+
+ xdr = (__be32 __user *) buffer;
+ zero = 0;
+#define ENCODE(x) \
+ do { \
+ __be32 y = htonl(x); \
+ if (put_user(y, xdr++) < 0) \
+ goto fault; \
+ } while(0)
+#define ENCODE_DATA(l, s) \
+ do { \
+ u32 _l = (l); \
+ ENCODE(l); \
+ if (copy_to_user(xdr, (s), _l) != 0) \
+ goto fault; \
+ if (_l & 3 && \
+ copy_to_user((u8 __user *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \
+ goto fault; \
+ xdr += (_l + 3) >> 2; \
+ } while(0)
+#define ENCODE64(x) \
+ do { \
+ __be64 y = cpu_to_be64(x); \
+ if (copy_to_user(xdr, &y, 8) != 0) \
+ goto fault; \
+ xdr += 8 >> 2; \
+ } while(0)
+#define ENCODE_STR(s) \
+ do { \
+ const char *_s = (s); \
+ ENCODE_DATA(strlen(_s), _s); \
+ } while(0)
+
+ ENCODE(0); /* flags */
+ ENCODE_DATA(cnlen, key->description + 4); /* cellname */
+ ENCODE(ntoks);
+
+ tok = 0;
+ for (token = key->payload.data[0]; token; token = token->next) {
+ toksize = toksizes[tok++];
+ ENCODE(toksize);
+ oldxdr = xdr;
+ ENCODE(token->security_index);
+
+ switch (token->security_index) {
+ case RXRPC_SECURITY_RXKAD:
+ ENCODE(token->kad->vice_id);
+ ENCODE(token->kad->kvno);
+ ENCODE_DATA(8, token->kad->session_key);
+ ENCODE(token->kad->start);
+ ENCODE(token->kad->expiry);
+ ENCODE(token->kad->primary_flag);
+ ENCODE_DATA(token->kad->ticket_len, token->kad->ticket);
+ break;
+
+ case RXRPC_SECURITY_RXK5:
+ princ = &token->k5->client;
+ ENCODE(princ->n_name_parts);
+ for (loop = 0; loop < princ->n_name_parts; loop++)
+ ENCODE_STR(princ->name_parts[loop]);
+ ENCODE_STR(princ->realm);
+
+ princ = &token->k5->server;
+ ENCODE(princ->n_name_parts);
+ for (loop = 0; loop < princ->n_name_parts; loop++)
+ ENCODE_STR(princ->name_parts[loop]);
+ ENCODE_STR(princ->realm);
+
+ ENCODE(token->k5->session.tag);
+ ENCODE_DATA(token->k5->session.data_len,
+ token->k5->session.data);
+
+ ENCODE64(token->k5->authtime);
+ ENCODE64(token->k5->starttime);
+ ENCODE64(token->k5->endtime);
+ ENCODE64(token->k5->renew_till);
+ ENCODE(token->k5->is_skey);
+ ENCODE(token->k5->flags);
+
+ ENCODE(token->k5->n_addresses);
+ for (loop = 0; loop < token->k5->n_addresses; loop++) {
+ ENCODE(token->k5->addresses[loop].tag);
+ ENCODE_DATA(token->k5->addresses[loop].data_len,
+ token->k5->addresses[loop].data);
+ }
+
+ ENCODE_DATA(token->k5->ticket_len, token->k5->ticket);
+ ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2);
+
+ ENCODE(token->k5->n_authdata);
+ for (loop = 0; loop < token->k5->n_authdata; loop++) {
+ ENCODE(token->k5->authdata[loop].tag);
+ ENCODE_DATA(token->k5->authdata[loop].data_len,
+ token->k5->authdata[loop].data);
+ }
+ break;
+
+ default:
+ BUG();
+ break;
+ }
+
+ ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==,
+ toksize);
+ }
+
+#undef ENCODE_STR
+#undef ENCODE_DATA
+#undef ENCODE64
+#undef ENCODE
+
+ ASSERTCMP(tok, ==, ntoks);
+ ASSERTCMP((char __user *) xdr - buffer, ==, size);
+ _leave(" = %zu", size);
+ return size;
+
+fault:
+ _leave(" = -EFAULT");
+ return -EFAULT;
+}
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
new file mode 100644
index 000000000..31a3f86ef
--- /dev/null
+++ b/net/rxrpc/local_event.c
@@ -0,0 +1,116 @@
+/* AF_RXRPC local endpoint management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <linux/ip.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <generated/utsrelease.h>
+#include "ar-internal.h"
+
+static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC";
+
+/*
+ * Reply to a version request
+ */
+static void rxrpc_send_version_request(struct rxrpc_local *local,
+ struct rxrpc_host_header *hdr,
+ struct sk_buff *skb)
+{
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct sockaddr_in sin;
+ struct msghdr msg;
+ struct kvec iov[2];
+ size_t len;
+ int ret;
+
+ _enter("");
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = udp_hdr(skb)->source;
+ sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+
+ msg.msg_name = &sin;
+ msg.msg_namelen = sizeof(sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.seq = 0;
+ whdr.serial = 0;
+ whdr.type = RXRPC_PACKET_TYPE_VERSION;
+ whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
+ whdr.userStatus = 0;
+ whdr.securityIndex = 0;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(sp->hdr.serviceId);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = (char *)rxrpc_version_string;
+ iov[1].iov_len = sizeof(rxrpc_version_string);
+
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ _proto("Tx VERSION (reply)");
+
+ ret = kernel_sendmsg(local->socket, &msg, iov, 2, len);
+ if (ret < 0)
+ _debug("sendmsg failed: %d", ret);
+
+ _leave("");
+}
+
+/*
+ * Process event packets targetted at a local endpoint.
+ */
+void rxrpc_process_local_events(struct rxrpc_local *local)
+{
+ struct sk_buff *skb;
+ char v;
+
+ _enter("");
+
+ skb = skb_dequeue(&local->event_queue);
+ if (skb) {
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ _debug("{%d},{%u}", local->debug_id, sp->hdr.type);
+
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_VERSION:
+ if (skb_copy_bits(skb, 0, &v, 1) < 0)
+ return;
+ _proto("Rx VERSION { %02x }", v);
+ if (v == 0)
+ rxrpc_send_version_request(local, &sp->hdr, skb);
+ break;
+
+ default:
+ /* Just ignore anything we don't understand */
+ break;
+ }
+
+ rxrpc_free_skb(skb);
+ }
+
+ _leave("");
+}
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
new file mode 100644
index 000000000..a753796fb
--- /dev/null
+++ b/net/rxrpc/local_object.c
@@ -0,0 +1,390 @@
+/* Local endpoint object management
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <linux/ip.h>
+#include <linux/hashtable.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static void rxrpc_local_processor(struct work_struct *);
+static void rxrpc_local_rcu(struct rcu_head *);
+
+static DEFINE_MUTEX(rxrpc_local_mutex);
+static LIST_HEAD(rxrpc_local_endpoints);
+
+/*
+ * Compare a local to an address. Return -ve, 0 or +ve to indicate less than,
+ * same or greater than.
+ *
+ * We explicitly don't compare the RxRPC service ID as we want to reject
+ * conflicting uses by differing services. Further, we don't want to share
+ * addresses with different options (IPv6), so we don't compare those bits
+ * either.
+ */
+static long rxrpc_local_cmp_key(const struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx)
+{
+ long diff;
+
+ diff = ((local->srx.transport_type - srx->transport_type) ?:
+ (local->srx.transport_len - srx->transport_len) ?:
+ (local->srx.transport.family - srx->transport.family));
+ if (diff != 0)
+ return diff;
+
+ switch (srx->transport.family) {
+ case AF_INET:
+ /* If the choice of UDP port is left up to the transport, then
+ * the endpoint record doesn't match.
+ */
+ return ((u16 __force)local->srx.transport.sin.sin_port -
+ (u16 __force)srx->transport.sin.sin_port) ?:
+ memcmp(&local->srx.transport.sin.sin_addr,
+ &srx->transport.sin.sin_addr,
+ sizeof(struct in_addr));
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Allocate a new local endpoint.
+ */
+static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_local *local;
+
+ local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL);
+ if (local) {
+ atomic_set(&local->usage, 1);
+ INIT_LIST_HEAD(&local->link);
+ INIT_WORK(&local->processor, rxrpc_local_processor);
+ INIT_LIST_HEAD(&local->services);
+ init_rwsem(&local->defrag_sem);
+ skb_queue_head_init(&local->accept_queue);
+ skb_queue_head_init(&local->reject_queue);
+ skb_queue_head_init(&local->event_queue);
+ local->client_conns = RB_ROOT;
+ spin_lock_init(&local->client_conns_lock);
+ spin_lock_init(&local->lock);
+ rwlock_init(&local->services_lock);
+ local->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ memcpy(&local->srx, srx, sizeof(*srx));
+ }
+
+ _leave(" = %p", local);
+ return local;
+}
+
+/*
+ * create the local socket
+ * - must be called with rxrpc_local_mutex locked
+ */
+static int rxrpc_open_socket(struct rxrpc_local *local)
+{
+ struct sock *sock;
+ int ret, opt;
+
+ _enter("%p{%d}", local, local->srx.transport_type);
+
+ /* create a socket to represent the local endpoint */
+ ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type,
+ IPPROTO_UDP, &local->socket);
+ if (ret < 0) {
+ _leave(" = %d [socket]", ret);
+ return ret;
+ }
+
+ /* if a local address was supplied then bind it */
+ if (local->srx.transport_len > sizeof(sa_family_t)) {
+ _debug("bind");
+ ret = kernel_bind(local->socket,
+ (struct sockaddr *)&local->srx.transport,
+ local->srx.transport_len);
+ if (ret < 0) {
+ _debug("bind failed %d", ret);
+ goto error;
+ }
+ }
+
+ /* we want to receive ICMP errors */
+ opt = 1;
+ ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR,
+ (char *) &opt, sizeof(opt));
+ if (ret < 0) {
+ _debug("setsockopt failed");
+ goto error;
+ }
+
+ /* we want to set the don't fragment bit */
+ opt = IP_PMTUDISC_DO;
+ ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER,
+ (char *) &opt, sizeof(opt));
+ if (ret < 0) {
+ _debug("setsockopt failed");
+ goto error;
+ }
+
+ /* set the socket up */
+ sock = local->socket->sk;
+ sock->sk_user_data = local;
+ sock->sk_data_ready = rxrpc_data_ready;
+ sock->sk_error_report = rxrpc_error_report;
+ _leave(" = 0");
+ return 0;
+
+error:
+ kernel_sock_shutdown(local->socket, SHUT_RDWR);
+ local->socket->sk->sk_user_data = NULL;
+ sock_release(local->socket);
+ local->socket = NULL;
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Look up or create a new local endpoint using the specified local address.
+ */
+struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_local *local;
+ struct list_head *cursor;
+ const char *age;
+ long diff;
+ int ret;
+
+ if (srx->transport.family == AF_INET) {
+ _enter("{%d,%u,%pI4+%hu}",
+ srx->transport_type,
+ srx->transport.family,
+ &srx->transport.sin.sin_addr,
+ ntohs(srx->transport.sin.sin_port));
+ } else {
+ _enter("{%d,%u}",
+ srx->transport_type,
+ srx->transport.family);
+ return ERR_PTR(-EAFNOSUPPORT);
+ }
+
+ mutex_lock(&rxrpc_local_mutex);
+
+ for (cursor = rxrpc_local_endpoints.next;
+ cursor != &rxrpc_local_endpoints;
+ cursor = cursor->next) {
+ local = list_entry(cursor, struct rxrpc_local, link);
+
+ diff = rxrpc_local_cmp_key(local, srx);
+ if (diff < 0)
+ continue;
+ if (diff > 0)
+ break;
+
+ /* Services aren't allowed to share transport sockets, so
+ * reject that here. It is possible that the object is dying -
+ * but it may also still have the local transport address that
+ * we want bound.
+ */
+ if (srx->srx_service) {
+ local = NULL;
+ goto addr_in_use;
+ }
+
+ /* Found a match. We replace a dying object. Attempting to
+ * bind the transport socket may still fail if we're attempting
+ * to use a local address that the dying object is still using.
+ */
+ if (!rxrpc_get_local_maybe(local)) {
+ cursor = cursor->next;
+ list_del_init(&local->link);
+ break;
+ }
+
+ age = "old";
+ goto found;
+ }
+
+ local = rxrpc_alloc_local(srx);
+ if (!local)
+ goto nomem;
+
+ ret = rxrpc_open_socket(local);
+ if (ret < 0)
+ goto sock_error;
+
+ list_add_tail(&local->link, cursor);
+ age = "new";
+
+found:
+ mutex_unlock(&rxrpc_local_mutex);
+
+ _net("LOCAL %s %d {%d,%u,%pI4+%hu}",
+ age,
+ local->debug_id,
+ local->srx.transport_type,
+ local->srx.transport.family,
+ &local->srx.transport.sin.sin_addr,
+ ntohs(local->srx.transport.sin.sin_port));
+
+ _leave(" = %p", local);
+ return local;
+
+nomem:
+ ret = -ENOMEM;
+sock_error:
+ mutex_unlock(&rxrpc_local_mutex);
+ kfree(local);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+
+addr_in_use:
+ mutex_unlock(&rxrpc_local_mutex);
+ _leave(" = -EADDRINUSE");
+ return ERR_PTR(-EADDRINUSE);
+}
+
+/*
+ * A local endpoint reached its end of life.
+ */
+void __rxrpc_put_local(struct rxrpc_local *local)
+{
+ _enter("%d", local->debug_id);
+ rxrpc_queue_work(&local->processor);
+}
+
+/*
+ * Destroy a local endpoint's socket and then hand the record to RCU to dispose
+ * of.
+ *
+ * Closing the socket cannot be done from bottom half context or RCU callback
+ * context because it might sleep.
+ */
+static void rxrpc_local_destroyer(struct rxrpc_local *local)
+{
+ struct socket *socket = local->socket;
+
+ _enter("%d", local->debug_id);
+
+ /* We can get a race between an incoming call packet queueing the
+ * processor again and the work processor starting the destruction
+ * process which will shut down the UDP socket.
+ */
+ if (local->dead) {
+ _leave(" [already dead]");
+ return;
+ }
+ local->dead = true;
+
+ mutex_lock(&rxrpc_local_mutex);
+ list_del_init(&local->link);
+ mutex_unlock(&rxrpc_local_mutex);
+
+ ASSERT(RB_EMPTY_ROOT(&local->client_conns));
+ ASSERT(list_empty(&local->services));
+
+ if (socket) {
+ local->socket = NULL;
+ kernel_sock_shutdown(socket, SHUT_RDWR);
+ socket->sk->sk_user_data = NULL;
+ sock_release(socket);
+ }
+
+ /* At this point, there should be no more packets coming in to the
+ * local endpoint.
+ */
+ rxrpc_purge_queue(&local->accept_queue);
+ rxrpc_purge_queue(&local->reject_queue);
+ rxrpc_purge_queue(&local->event_queue);
+
+ _debug("rcu local %d", local->debug_id);
+ call_rcu(&local->rcu, rxrpc_local_rcu);
+}
+
+/*
+ * Process events on an endpoint
+ */
+static void rxrpc_local_processor(struct work_struct *work)
+{
+ struct rxrpc_local *local =
+ container_of(work, struct rxrpc_local, processor);
+ bool again;
+
+ _enter("%d", local->debug_id);
+
+ do {
+ again = false;
+ if (atomic_read(&local->usage) == 0)
+ return rxrpc_local_destroyer(local);
+
+ if (!skb_queue_empty(&local->accept_queue)) {
+ rxrpc_accept_incoming_calls(local);
+ again = true;
+ }
+
+ if (!skb_queue_empty(&local->reject_queue)) {
+ rxrpc_reject_packets(local);
+ again = true;
+ }
+
+ if (!skb_queue_empty(&local->event_queue)) {
+ rxrpc_process_local_events(local);
+ again = true;
+ }
+ } while (again);
+}
+
+/*
+ * Destroy a local endpoint after the RCU grace period expires.
+ */
+static void rxrpc_local_rcu(struct rcu_head *rcu)
+{
+ struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu);
+
+ _enter("%d", local->debug_id);
+
+ ASSERT(!work_pending(&local->processor));
+
+ _net("DESTROY LOCAL %d", local->debug_id);
+ kfree(local);
+ _leave("");
+}
+
+/*
+ * Verify the local endpoint list is empty by this point.
+ */
+void __exit rxrpc_destroy_all_locals(void)
+{
+ struct rxrpc_local *local;
+
+ _enter("");
+
+ flush_workqueue(rxrpc_workqueue);
+
+ if (!list_empty(&rxrpc_local_endpoints)) {
+ mutex_lock(&rxrpc_local_mutex);
+ list_for_each_entry(local, &rxrpc_local_endpoints, link) {
+ pr_err("AF_RXRPC: Leaked local %p {%d}\n",
+ local, atomic_read(&local->usage));
+ }
+ mutex_unlock(&rxrpc_local_mutex);
+ BUG();
+ }
+
+ rcu_barrier();
+}
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 1afe9876e..bdc5e42fe 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -15,6 +15,12 @@
#include "ar-internal.h"
/*
+ * The maximum listening backlog queue size that may be set on a socket by
+ * listen().
+ */
+unsigned int rxrpc_max_backlog __read_mostly = 10;
+
+/*
* How long to wait before scheduling ACK generation after seeing a
* packet with RXRPC_REQUEST_ACK set (in jiffies).
*/
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
new file mode 100644
index 000000000..f4bda06b7
--- /dev/null
+++ b/net/rxrpc/output.c
@@ -0,0 +1,721 @@
+/* RxRPC packet transmission
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/circ_buf.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Time till packet resend (in jiffies).
+ */
+unsigned int rxrpc_resend_timeout = 4 * HZ;
+
+static int rxrpc_send_data(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct msghdr *msg, size_t len);
+
+/*
+ * extract control messages from the sendmsg() control buffer
+ */
+static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
+ unsigned long *user_call_ID,
+ enum rxrpc_command *command,
+ u32 *abort_code,
+ bool *_exclusive)
+{
+ struct cmsghdr *cmsg;
+ bool got_user_ID = false;
+ int len;
+
+ *command = RXRPC_CMD_SEND_DATA;
+
+ if (msg->msg_controllen == 0)
+ return -EINVAL;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ _debug("CMSG %d, %d, %d",
+ cmsg->cmsg_level, cmsg->cmsg_type, len);
+
+ if (cmsg->cmsg_level != SOL_RXRPC)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case RXRPC_USER_CALL_ID:
+ if (msg->msg_flags & MSG_CMSG_COMPAT) {
+ if (len != sizeof(u32))
+ return -EINVAL;
+ *user_call_ID = *(u32 *) CMSG_DATA(cmsg);
+ } else {
+ if (len != sizeof(unsigned long))
+ return -EINVAL;
+ *user_call_ID = *(unsigned long *)
+ CMSG_DATA(cmsg);
+ }
+ _debug("User Call ID %lx", *user_call_ID);
+ got_user_ID = true;
+ break;
+
+ case RXRPC_ABORT:
+ if (*command != RXRPC_CMD_SEND_DATA)
+ return -EINVAL;
+ *command = RXRPC_CMD_SEND_ABORT;
+ if (len != sizeof(*abort_code))
+ return -EINVAL;
+ *abort_code = *(unsigned int *) CMSG_DATA(cmsg);
+ _debug("Abort %x", *abort_code);
+ if (*abort_code == 0)
+ return -EINVAL;
+ break;
+
+ case RXRPC_ACCEPT:
+ if (*command != RXRPC_CMD_SEND_DATA)
+ return -EINVAL;
+ *command = RXRPC_CMD_ACCEPT;
+ if (len != 0)
+ return -EINVAL;
+ break;
+
+ case RXRPC_EXCLUSIVE_CALL:
+ *_exclusive = true;
+ if (len != 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (!got_user_ID)
+ return -EINVAL;
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * abort a call, sending an ABORT packet to the peer
+ */
+static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code)
+{
+ write_lock_bh(&call->state_lock);
+
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->local_abort = abort_code;
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ del_timer_sync(&call->resend_timer);
+ del_timer_sync(&call->ack_timer);
+ clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_EV_ACK, &call->events);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ rxrpc_queue_call(call);
+ }
+
+ write_unlock_bh(&call->state_lock);
+}
+
+/*
+ * Create a new client call for sendmsg().
+ */
+static struct rxrpc_call *
+rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
+ unsigned long user_call_ID, bool exclusive)
+{
+ struct rxrpc_conn_parameters cp;
+ struct rxrpc_call *call;
+ struct key *key;
+
+ DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name);
+
+ _enter("");
+
+ if (!msg->msg_name)
+ return ERR_PTR(-EDESTADDRREQ);
+
+ key = rx->key;
+ if (key && !rx->key->payload.data[0])
+ key = NULL;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.local = rx->local;
+ cp.key = rx->key;
+ cp.security_level = rx->min_sec_level;
+ cp.exclusive = rx->exclusive | exclusive;
+ cp.service_id = srx->srx_service;
+ call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL);
+
+ _leave(" = %p\n", call);
+ return call;
+}
+
+/*
+ * send a message forming part of a client call through an RxRPC socket
+ * - caller holds the socket locked
+ * - the socket may be either a client socket or a server socket
+ */
+int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
+{
+ enum rxrpc_command cmd;
+ struct rxrpc_call *call;
+ unsigned long user_call_ID = 0;
+ bool exclusive = false;
+ u32 abort_code = 0;
+ int ret;
+
+ _enter("");
+
+ ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code,
+ &exclusive);
+ if (ret < 0)
+ return ret;
+
+ if (cmd == RXRPC_CMD_ACCEPT) {
+ if (rx->sk.sk_state != RXRPC_SERVER_LISTENING)
+ return -EINVAL;
+ call = rxrpc_accept_call(rx, user_call_ID);
+ if (IS_ERR(call))
+ return PTR_ERR(call);
+ rxrpc_put_call(call);
+ return 0;
+ }
+
+ call = rxrpc_find_call_by_user_ID(rx, user_call_ID);
+ if (!call) {
+ if (cmd != RXRPC_CMD_SEND_DATA)
+ return -EBADSLT;
+ call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID,
+ exclusive);
+ if (IS_ERR(call))
+ return PTR_ERR(call);
+ }
+
+ _debug("CALL %d USR %lx ST %d on CONN %p",
+ call->debug_id, call->user_call_ID, call->state, call->conn);
+
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ /* it's too late for this call */
+ ret = -ECONNRESET;
+ } else if (cmd == RXRPC_CMD_SEND_ABORT) {
+ rxrpc_send_abort(call, abort_code);
+ ret = 0;
+ } else if (cmd != RXRPC_CMD_SEND_DATA) {
+ ret = -EINVAL;
+ } else if (!call->in_clientflag &&
+ call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
+ /* request phase complete for this client call */
+ ret = -EPROTO;
+ } else if (call->in_clientflag &&
+ call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
+ /* Reply phase not begun or not complete for service call. */
+ ret = -EPROTO;
+ } else {
+ ret = rxrpc_send_data(rx, call, msg, len);
+ }
+
+ rxrpc_put_call(call);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * rxrpc_kernel_send_data - Allow a kernel service to send data on a call
+ * @call: The call to send data through
+ * @msg: The data to send
+ * @len: The amount of data to send
+ *
+ * Allow a kernel service to send data on a call. The call must be in an state
+ * appropriate to sending data. No control data should be supplied in @msg,
+ * nor should an address be supplied. MSG_MORE should be flagged if there's
+ * more data to come, otherwise this data will end the transmission phase.
+ */
+int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg,
+ size_t len)
+{
+ int ret;
+
+ _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]);
+
+ ASSERTCMP(msg->msg_name, ==, NULL);
+ ASSERTCMP(msg->msg_control, ==, NULL);
+
+ lock_sock(&call->socket->sk);
+
+ _debug("CALL %d USR %lx ST %d on CONN %p",
+ call->debug_id, call->user_call_ID, call->state, call->conn);
+
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ ret = -ESHUTDOWN; /* it's too late for this call */
+ } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
+ ret = -EPROTO; /* request phase complete for this client call */
+ } else {
+ ret = rxrpc_send_data(call->socket, call, msg, len);
+ }
+
+ release_sock(&call->socket->sk);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_send_data);
+
+/**
+ * rxrpc_kernel_abort_call - Allow a kernel service to abort a call
+ * @call: The call to be aborted
+ * @abort_code: The abort code to stick into the ABORT packet
+ *
+ * Allow a kernel service to abort a call, if it's still in an abortable state.
+ */
+void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code)
+{
+ _enter("{%d},%d", call->debug_id, abort_code);
+
+ lock_sock(&call->socket->sk);
+
+ _debug("CALL %d USR %lx ST %d on CONN %p",
+ call->debug_id, call->user_call_ID, call->state, call->conn);
+
+ if (call->state < RXRPC_CALL_COMPLETE)
+ rxrpc_send_abort(call, abort_code);
+
+ release_sock(&call->socket->sk);
+ _leave("");
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_abort_call);
+
+/*
+ * send a packet through the transport endpoint
+ */
+int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
+{
+ struct kvec iov[1];
+ struct msghdr msg;
+ int ret, opt;
+
+ _enter(",{%d}", skb->len);
+
+ iov[0].iov_base = skb->head;
+ iov[0].iov_len = skb->len;
+
+ msg.msg_name = &conn->params.peer->srx.transport;
+ msg.msg_namelen = conn->params.peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ /* send the packet with the don't fragment bit set if we currently
+ * think it's small enough */
+ if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) {
+ down_read(&conn->params.local->defrag_sem);
+ /* send the packet by UDP
+ * - returns -EMSGSIZE if UDP would have to fragment the packet
+ * to go out of the interface
+ * - in which case, we'll have processed the ICMP error
+ * message and update the peer record
+ */
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
+ iov[0].iov_len);
+
+ up_read(&conn->params.local->defrag_sem);
+ if (ret == -EMSGSIZE)
+ goto send_fragmentable;
+
+ _leave(" = %d [%u]", ret, conn->params.peer->maxdata);
+ return ret;
+ }
+
+send_fragmentable:
+ /* attempt to send this message with fragmentation enabled */
+ _debug("send fragment");
+
+ down_write(&conn->params.local->defrag_sem);
+
+ switch (conn->params.local->srx.transport.family) {
+ case AF_INET:
+ opt = IP_PMTUDISC_DONT;
+ ret = kernel_setsockopt(conn->params.local->socket,
+ SOL_IP, IP_MTU_DISCOVER,
+ (char *)&opt, sizeof(opt));
+ if (ret == 0) {
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
+ iov[0].iov_len);
+
+ opt = IP_PMTUDISC_DO;
+ kernel_setsockopt(conn->params.local->socket, SOL_IP,
+ IP_MTU_DISCOVER,
+ (char *)&opt, sizeof(opt));
+ }
+ break;
+ }
+
+ up_write(&conn->params.local->defrag_sem);
+ _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata);
+ return ret;
+}
+
+/*
+ * wait for space to appear in the transmit/ACK window
+ * - caller holds the socket locked
+ */
+static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ long *timeo)
+{
+ DECLARE_WAITQUEUE(myself, current);
+ int ret;
+
+ _enter(",{%d},%ld",
+ CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
+ call->acks_winsz),
+ *timeo);
+
+ add_wait_queue(&call->tx_waitq, &myself);
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = 0;
+ if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
+ call->acks_winsz) > 0)
+ break;
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(*timeo);
+ break;
+ }
+
+ release_sock(&rx->sk);
+ *timeo = schedule_timeout(*timeo);
+ lock_sock(&rx->sk);
+ }
+
+ remove_wait_queue(&call->tx_waitq, &myself);
+ set_current_state(TASK_RUNNING);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * attempt to schedule an instant Tx resend
+ */
+static inline void rxrpc_instant_resend(struct rxrpc_call *call)
+{
+ read_lock_bh(&call->state_lock);
+ if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
+ rxrpc_queue_call(call);
+ }
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * queue a packet for transmission, set the resend timer and attempt
+ * to send the packet immediately
+ */
+static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ bool last)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ int ret;
+
+ _net("queue skb %p [%d]", skb, call->acks_head);
+
+ ASSERT(call->acks_window != NULL);
+ call->acks_window[call->acks_head] = (unsigned long) skb;
+ smp_wmb();
+ call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1);
+
+ if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
+ _debug("________awaiting reply/ACK__________");
+ write_lock_bh(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
+ break;
+ case RXRPC_CALL_SERVER_ACK_REQUEST:
+ call->state = RXRPC_CALL_SERVER_SEND_REPLY;
+ if (!last)
+ break;
+ case RXRPC_CALL_SERVER_SEND_REPLY:
+ call->state = RXRPC_CALL_SERVER_AWAIT_ACK;
+ break;
+ default:
+ break;
+ }
+ write_unlock_bh(&call->state_lock);
+ }
+
+ _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
+
+ sp->need_resend = false;
+ sp->resend_at = jiffies + rxrpc_resend_timeout;
+ if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) {
+ _debug("run timer");
+ call->resend_timer.expires = sp->resend_at;
+ add_timer(&call->resend_timer);
+ }
+
+ /* attempt to cancel the rx-ACK timer, deferring reply transmission if
+ * we're ACK'ing the request phase of an incoming call */
+ ret = -EAGAIN;
+ if (try_to_del_timer_sync(&call->ack_timer) >= 0) {
+ /* the packet may be freed by rxrpc_process_call() before this
+ * returns */
+ ret = rxrpc_send_data_packet(call->conn, skb);
+ _net("sent skb %p", skb);
+ } else {
+ _debug("failed to delete ACK timer");
+ }
+
+ if (ret < 0) {
+ _debug("need instant resend %d", ret);
+ sp->need_resend = true;
+ rxrpc_instant_resend(call);
+ }
+
+ _leave("");
+}
+
+/*
+ * Convert a host-endian header into a network-endian header.
+ */
+static void rxrpc_insert_header(struct sk_buff *skb)
+{
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.seq = htonl(sp->hdr.seq);
+ whdr.serial = htonl(sp->hdr.serial);
+ whdr.type = sp->hdr.type;
+ whdr.flags = sp->hdr.flags;
+ whdr.userStatus = sp->hdr.userStatus;
+ whdr.securityIndex = sp->hdr.securityIndex;
+ whdr._rsvd = htons(sp->hdr._rsvd);
+ whdr.serviceId = htons(sp->hdr.serviceId);
+
+ memcpy(skb->head, &whdr, sizeof(whdr));
+}
+
+/*
+ * send data through a socket
+ * - must be called in process context
+ * - caller holds the socket locked
+ */
+static int rxrpc_send_data(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct msghdr *msg, size_t len)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ struct sock *sk = &rx->sk;
+ long timeo;
+ bool more;
+ int ret, copied;
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ /* this should be in poll */
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ return -EPIPE;
+
+ more = msg->msg_flags & MSG_MORE;
+
+ skb = call->tx_pending;
+ call->tx_pending = NULL;
+
+ copied = 0;
+ do {
+ if (!skb) {
+ size_t size, chunk, max, space;
+
+ _debug("alloc");
+
+ if (CIRC_SPACE(call->acks_head,
+ ACCESS_ONCE(call->acks_tail),
+ call->acks_winsz) <= 0) {
+ ret = -EAGAIN;
+ if (msg->msg_flags & MSG_DONTWAIT)
+ goto maybe_error;
+ ret = rxrpc_wait_for_tx_window(rx, call,
+ &timeo);
+ if (ret < 0)
+ goto maybe_error;
+ }
+
+ max = call->conn->params.peer->maxdata;
+ max -= call->conn->security_size;
+ max &= ~(call->conn->size_align - 1UL);
+
+ chunk = max;
+ if (chunk > msg_data_left(msg) && !more)
+ chunk = msg_data_left(msg);
+
+ space = chunk + call->conn->size_align;
+ space &= ~(call->conn->size_align - 1UL);
+
+ size = space + call->conn->header_size;
+
+ _debug("SIZE: %zu/%zu/%zu", chunk, space, size);
+
+ /* create a buffer that we can retain until it's ACK'd */
+ skb = sock_alloc_send_skb(
+ sk, size, msg->msg_flags & MSG_DONTWAIT, &ret);
+ if (!skb)
+ goto maybe_error;
+
+ rxrpc_new_skb(skb);
+
+ _debug("ALLOC SEND %p", skb);
+
+ ASSERTCMP(skb->mark, ==, 0);
+
+ _debug("HS: %u", call->conn->header_size);
+ skb_reserve(skb, call->conn->header_size);
+ skb->len += call->conn->header_size;
+
+ sp = rxrpc_skb(skb);
+ sp->remain = chunk;
+ if (sp->remain > skb_tailroom(skb))
+ sp->remain = skb_tailroom(skb);
+
+ _net("skb: hr %d, tr %d, hl %d, rm %d",
+ skb_headroom(skb),
+ skb_tailroom(skb),
+ skb_headlen(skb),
+ sp->remain);
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ _debug("append");
+ sp = rxrpc_skb(skb);
+
+ /* append next segment of data to the current buffer */
+ if (msg_data_left(msg) > 0) {
+ int copy = skb_tailroom(skb);
+ ASSERTCMP(copy, >, 0);
+ if (copy > msg_data_left(msg))
+ copy = msg_data_left(msg);
+ if (copy > sp->remain)
+ copy = sp->remain;
+
+ _debug("add");
+ ret = skb_add_data(skb, &msg->msg_iter, copy);
+ _debug("added");
+ if (ret < 0)
+ goto efault;
+ sp->remain -= copy;
+ skb->mark += copy;
+ copied += copy;
+ }
+
+ /* check for the far side aborting the call or a network error
+ * occurring */
+ if (call->state > RXRPC_CALL_COMPLETE)
+ goto call_aborted;
+
+ /* add the packet to the send queue if it's now full */
+ if (sp->remain <= 0 ||
+ (msg_data_left(msg) == 0 && !more)) {
+ struct rxrpc_connection *conn = call->conn;
+ uint32_t seq;
+ size_t pad;
+
+ /* pad out if we're using security */
+ if (conn->security_ix) {
+ pad = conn->security_size + skb->mark;
+ pad = conn->size_align - pad;
+ pad &= conn->size_align - 1;
+ _debug("pad %zu", pad);
+ if (pad)
+ memset(skb_put(skb, pad), 0, pad);
+ }
+
+ seq = atomic_inc_return(&call->sequence);
+
+ sp->hdr.epoch = conn->proto.epoch;
+ sp->hdr.cid = call->cid;
+ sp->hdr.callNumber = call->call_id;
+ sp->hdr.seq = seq;
+ sp->hdr.serial = atomic_inc_return(&conn->serial);
+ sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
+ sp->hdr.userStatus = 0;
+ sp->hdr.securityIndex = conn->security_ix;
+ sp->hdr._rsvd = 0;
+ sp->hdr.serviceId = call->service_id;
+
+ sp->hdr.flags = conn->out_clientflag;
+ if (msg_data_left(msg) == 0 && !more)
+ sp->hdr.flags |= RXRPC_LAST_PACKET;
+ else if (CIRC_SPACE(call->acks_head,
+ ACCESS_ONCE(call->acks_tail),
+ call->acks_winsz) > 1)
+ sp->hdr.flags |= RXRPC_MORE_PACKETS;
+ if (more && seq & 1)
+ sp->hdr.flags |= RXRPC_REQUEST_ACK;
+
+ ret = conn->security->secure_packet(
+ call, skb, skb->mark,
+ skb->head + sizeof(struct rxrpc_wire_header));
+ if (ret < 0)
+ goto out;
+
+ rxrpc_insert_header(skb);
+ rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
+ skb = NULL;
+ }
+ } while (msg_data_left(msg) > 0);
+
+success:
+ ret = copied;
+out:
+ call->tx_pending = skb;
+ _leave(" = %d", ret);
+ return ret;
+
+call_aborted:
+ rxrpc_free_skb(skb);
+ if (call->state == RXRPC_CALL_NETWORK_ERROR)
+ ret = call->error_report < RXRPC_LOCAL_ERROR_OFFSET ?
+ call->error_report :
+ call->error_report - RXRPC_LOCAL_ERROR_OFFSET;
+ else
+ ret = -ECONNABORTED;
+ _leave(" = %d", ret);
+ return ret;
+
+maybe_error:
+ if (copied)
+ goto success;
+ goto out;
+
+efault:
+ ret = -EFAULT;
+ goto out;
+}
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
new file mode 100644
index 000000000..8940674b5
--- /dev/null
+++ b/net/rxrpc/peer_event.c
@@ -0,0 +1,281 @@
+/* Peer event handling, typically ICMP messages.
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *);
+
+/*
+ * Find the peer associated with an ICMP packet.
+ */
+static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
+ const struct sk_buff *skb)
+{
+ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ struct sockaddr_rxrpc srx;
+
+ _enter("");
+
+ memset(&srx, 0, sizeof(srx));
+ srx.transport_type = local->srx.transport_type;
+ srx.transport.family = local->srx.transport.family;
+
+ /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice
+ * versa?
+ */
+ switch (srx.transport.family) {
+ case AF_INET:
+ srx.transport.sin.sin_port = serr->port;
+ srx.transport_len = sizeof(struct sockaddr_in);
+ switch (serr->ee.ee_origin) {
+ case SO_EE_ORIGIN_ICMP:
+ _net("Rx ICMP");
+ memcpy(&srx.transport.sin.sin_addr,
+ skb_network_header(skb) + serr->addr_offset,
+ sizeof(struct in_addr));
+ break;
+ case SO_EE_ORIGIN_ICMP6:
+ _net("Rx ICMP6 on v4 sock");
+ memcpy(&srx.transport.sin.sin_addr,
+ skb_network_header(skb) + serr->addr_offset + 12,
+ sizeof(struct in_addr));
+ break;
+ default:
+ memcpy(&srx.transport.sin.sin_addr, &ip_hdr(skb)->saddr,
+ sizeof(struct in_addr));
+ break;
+ }
+ break;
+
+ default:
+ BUG();
+ }
+
+ return rxrpc_lookup_peer_rcu(local, &srx);
+}
+
+/*
+ * Handle an MTU/fragmentation problem.
+ */
+static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr)
+{
+ u32 mtu = serr->ee.ee_info;
+
+ _net("Rx ICMP Fragmentation Needed (%d)", mtu);
+
+ /* wind down the local interface MTU */
+ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) {
+ peer->if_mtu = mtu;
+ _net("I/F MTU %u", mtu);
+ }
+
+ if (mtu == 0) {
+ /* they didn't give us a size, estimate one */
+ mtu = peer->if_mtu;
+ if (mtu > 1500) {
+ mtu >>= 1;
+ if (mtu < 1500)
+ mtu = 1500;
+ } else {
+ mtu -= 100;
+ if (mtu < peer->hdrsize)
+ mtu = peer->hdrsize + 4;
+ }
+ }
+
+ if (mtu < peer->mtu) {
+ spin_lock_bh(&peer->lock);
+ peer->mtu = mtu;
+ peer->maxdata = peer->mtu - peer->hdrsize;
+ spin_unlock_bh(&peer->lock);
+ _net("Net MTU %u (maxdata %u)",
+ peer->mtu, peer->maxdata);
+ }
+}
+
+/*
+ * Handle an error received on the local endpoint.
+ */
+void rxrpc_error_report(struct sock *sk)
+{
+ struct sock_exterr_skb *serr;
+ struct rxrpc_local *local = sk->sk_user_data;
+ struct rxrpc_peer *peer;
+ struct sk_buff *skb;
+
+ _enter("%p{%d}", sk, local->debug_id);
+
+ skb = sock_dequeue_err_skb(sk);
+ if (!skb) {
+ _leave("UDP socket errqueue empty");
+ return;
+ }
+ serr = SKB_EXT_ERR(skb);
+ if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
+ _leave("UDP empty message");
+ kfree_skb(skb);
+ return;
+ }
+
+ rxrpc_new_skb(skb);
+
+ rcu_read_lock();
+ peer = rxrpc_lookup_peer_icmp_rcu(local, skb);
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ if (!peer) {
+ rcu_read_unlock();
+ rxrpc_free_skb(skb);
+ _leave(" [no peer]");
+ return;
+ }
+
+ if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP &&
+ serr->ee.ee_type == ICMP_DEST_UNREACH &&
+ serr->ee.ee_code == ICMP_FRAG_NEEDED)) {
+ rxrpc_adjust_mtu(peer, serr);
+ rcu_read_unlock();
+ rxrpc_free_skb(skb);
+ rxrpc_put_peer(peer);
+ _leave(" [MTU update]");
+ return;
+ }
+
+ rxrpc_store_error(peer, serr);
+ rcu_read_unlock();
+ rxrpc_free_skb(skb);
+
+ /* The ref we obtained is passed off to the work item */
+ rxrpc_queue_work(&peer->error_distributor);
+ _leave("");
+}
+
+/*
+ * Map an error report to error codes on the peer record.
+ */
+static void rxrpc_store_error(struct rxrpc_peer *peer,
+ struct sock_exterr_skb *serr)
+{
+ struct sock_extended_err *ee;
+ int err;
+
+ _enter("");
+
+ ee = &serr->ee;
+
+ _net("Rx Error o=%d t=%d c=%d e=%d",
+ ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno);
+
+ err = ee->ee_errno;
+
+ switch (ee->ee_origin) {
+ case SO_EE_ORIGIN_ICMP:
+ switch (ee->ee_type) {
+ case ICMP_DEST_UNREACH:
+ switch (ee->ee_code) {
+ case ICMP_NET_UNREACH:
+ _net("Rx Received ICMP Network Unreachable");
+ break;
+ case ICMP_HOST_UNREACH:
+ _net("Rx Received ICMP Host Unreachable");
+ break;
+ case ICMP_PORT_UNREACH:
+ _net("Rx Received ICMP Port Unreachable");
+ break;
+ case ICMP_NET_UNKNOWN:
+ _net("Rx Received ICMP Unknown Network");
+ break;
+ case ICMP_HOST_UNKNOWN:
+ _net("Rx Received ICMP Unknown Host");
+ break;
+ default:
+ _net("Rx Received ICMP DestUnreach code=%u",
+ ee->ee_code);
+ break;
+ }
+ break;
+
+ case ICMP_TIME_EXCEEDED:
+ _net("Rx Received ICMP TTL Exceeded");
+ break;
+
+ default:
+ _proto("Rx Received ICMP error { type=%u code=%u }",
+ ee->ee_type, ee->ee_code);
+ break;
+ }
+ break;
+
+ case SO_EE_ORIGIN_NONE:
+ case SO_EE_ORIGIN_LOCAL:
+ _proto("Rx Received local error { error=%d }", err);
+ err += RXRPC_LOCAL_ERROR_OFFSET;
+ break;
+
+ case SO_EE_ORIGIN_ICMP6:
+ default:
+ _proto("Rx Received error report { orig=%u }", ee->ee_origin);
+ break;
+ }
+
+ peer->error_report = err;
+}
+
+/*
+ * Distribute an error that occurred on a peer
+ */
+void rxrpc_peer_error_distributor(struct work_struct *work)
+{
+ struct rxrpc_peer *peer =
+ container_of(work, struct rxrpc_peer, error_distributor);
+ struct rxrpc_call *call;
+ int error_report;
+
+ _enter("");
+
+ error_report = READ_ONCE(peer->error_report);
+
+ _debug("ISSUE ERROR %d", error_report);
+
+ spin_lock_bh(&peer->lock);
+
+ while (!hlist_empty(&peer->error_targets)) {
+ call = hlist_entry(peer->error_targets.first,
+ struct rxrpc_call, error_link);
+ hlist_del_init(&call->error_link);
+
+ write_lock(&call->state_lock);
+ if (call->state != RXRPC_CALL_COMPLETE &&
+ call->state < RXRPC_CALL_NETWORK_ERROR) {
+ call->error_report = error_report;
+ call->state = RXRPC_CALL_NETWORK_ERROR;
+ set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
+ rxrpc_queue_call(call);
+ }
+ write_unlock(&call->state_lock);
+ }
+
+ spin_unlock_bh(&peer->lock);
+
+ rxrpc_put_peer(peer);
+ _leave("");
+}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
new file mode 100644
index 000000000..538e9831c
--- /dev/null
+++ b/net/rxrpc/peer_object.c
@@ -0,0 +1,315 @@
+/* RxRPC remote transport endpoint record management
+ *
+ * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/hashtable.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include "ar-internal.h"
+
+static DEFINE_HASHTABLE(rxrpc_peer_hash, 10);
+static DEFINE_SPINLOCK(rxrpc_peer_hash_lock);
+
+/*
+ * Hash a peer key.
+ */
+static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx)
+{
+ const u16 *p;
+ unsigned int i, size;
+ unsigned long hash_key;
+
+ _enter("");
+
+ hash_key = (unsigned long)local / __alignof__(*local);
+ hash_key += srx->transport_type;
+ hash_key += srx->transport_len;
+ hash_key += srx->transport.family;
+
+ switch (srx->transport.family) {
+ case AF_INET:
+ hash_key += (u16 __force)srx->transport.sin.sin_port;
+ size = sizeof(srx->transport.sin.sin_addr);
+ p = (u16 *)&srx->transport.sin.sin_addr;
+ break;
+ default:
+ WARN(1, "AF_RXRPC: Unsupported transport address family\n");
+ return 0;
+ }
+
+ /* Step through the peer address in 16-bit portions for speed */
+ for (i = 0; i < size; i += sizeof(*p), p++)
+ hash_key += *p;
+
+ _leave(" 0x%lx", hash_key);
+ return hash_key;
+}
+
+/*
+ * Compare a peer to a key. Return -ve, 0 or +ve to indicate less than, same
+ * or greater than.
+ *
+ * Unfortunately, the primitives in linux/hashtable.h don't allow for sorted
+ * buckets and mid-bucket insertion, so we don't make full use of this
+ * information at this point.
+ */
+static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer,
+ struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx,
+ unsigned long hash_key)
+{
+ long diff;
+
+ diff = ((peer->hash_key - hash_key) ?:
+ ((unsigned long)peer->local - (unsigned long)local) ?:
+ (peer->srx.transport_type - srx->transport_type) ?:
+ (peer->srx.transport_len - srx->transport_len) ?:
+ (peer->srx.transport.family - srx->transport.family));
+ if (diff != 0)
+ return diff;
+
+ switch (srx->transport.family) {
+ case AF_INET:
+ return ((u16 __force)peer->srx.transport.sin.sin_port -
+ (u16 __force)srx->transport.sin.sin_port) ?:
+ memcmp(&peer->srx.transport.sin.sin_addr,
+ &srx->transport.sin.sin_addr,
+ sizeof(struct in_addr));
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Look up a remote transport endpoint for the specified address using RCU.
+ */
+static struct rxrpc_peer *__rxrpc_lookup_peer_rcu(
+ struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx,
+ unsigned long hash_key)
+{
+ struct rxrpc_peer *peer;
+
+ hash_for_each_possible_rcu(rxrpc_peer_hash, peer, hash_link, hash_key) {
+ if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) {
+ if (atomic_read(&peer->usage) == 0)
+ return NULL;
+ return peer;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Look up a remote transport endpoint for the specified address using RCU.
+ */
+struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_peer *peer;
+ unsigned long hash_key = rxrpc_peer_hash_key(local, srx);
+
+ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
+ if (peer) {
+ switch (srx->transport.family) {
+ case AF_INET:
+ _net("PEER %d {%d,%u,%pI4+%hu}",
+ peer->debug_id,
+ peer->srx.transport_type,
+ peer->srx.transport.family,
+ &peer->srx.transport.sin.sin_addr,
+ ntohs(peer->srx.transport.sin.sin_port));
+ break;
+ }
+
+ _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
+ }
+ return peer;
+}
+
+/*
+ * assess the MTU size for the network interface through which this peer is
+ * reached
+ */
+static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
+{
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ peer->if_mtu = 1500;
+
+ rt = ip_route_output_ports(&init_net, &fl4, NULL,
+ peer->srx.transport.sin.sin_addr.s_addr, 0,
+ htons(7000), htons(7001),
+ IPPROTO_UDP, 0, 0);
+ if (IS_ERR(rt)) {
+ _leave(" [route err %ld]", PTR_ERR(rt));
+ return;
+ }
+
+ peer->if_mtu = dst_mtu(&rt->dst);
+ dst_release(&rt->dst);
+
+ _leave(" [if_mtu %u]", peer->if_mtu);
+}
+
+/*
+ * Allocate a peer.
+ */
+struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
+{
+ struct rxrpc_peer *peer;
+
+ _enter("");
+
+ peer = kzalloc(sizeof(struct rxrpc_peer), gfp);
+ if (peer) {
+ atomic_set(&peer->usage, 1);
+ peer->local = local;
+ INIT_HLIST_HEAD(&peer->error_targets);
+ INIT_WORK(&peer->error_distributor,
+ &rxrpc_peer_error_distributor);
+ peer->service_conns = RB_ROOT;
+ seqlock_init(&peer->service_conn_lock);
+ spin_lock_init(&peer->lock);
+ peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ }
+
+ _leave(" = %p", peer);
+ return peer;
+}
+
+/*
+ * Set up a new peer.
+ */
+static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
+ struct sockaddr_rxrpc *srx,
+ unsigned long hash_key,
+ gfp_t gfp)
+{
+ struct rxrpc_peer *peer;
+
+ _enter("");
+
+ peer = rxrpc_alloc_peer(local, gfp);
+ if (peer) {
+ peer->hash_key = hash_key;
+ memcpy(&peer->srx, srx, sizeof(*srx));
+
+ rxrpc_assess_MTU_size(peer);
+ peer->mtu = peer->if_mtu;
+
+ if (srx->transport.family == AF_INET) {
+ peer->hdrsize = sizeof(struct iphdr);
+ switch (srx->transport_type) {
+ case SOCK_DGRAM:
+ peer->hdrsize += sizeof(struct udphdr);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ } else {
+ BUG();
+ }
+
+ peer->hdrsize += sizeof(struct rxrpc_wire_header);
+ peer->maxdata = peer->mtu - peer->hdrsize;
+ }
+
+ _leave(" = %p", peer);
+ return peer;
+}
+
+/*
+ * obtain a remote transport endpoint for the specified address
+ */
+struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
+ struct sockaddr_rxrpc *srx, gfp_t gfp)
+{
+ struct rxrpc_peer *peer, *candidate;
+ unsigned long hash_key = rxrpc_peer_hash_key(local, srx);
+
+ _enter("{%d,%d,%pI4+%hu}",
+ srx->transport_type,
+ srx->transport_len,
+ &srx->transport.sin.sin_addr,
+ ntohs(srx->transport.sin.sin_port));
+
+ /* search the peer list first */
+ rcu_read_lock();
+ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ rcu_read_unlock();
+
+ if (!peer) {
+ /* The peer is not yet present in hash - create a candidate
+ * for a new record and then redo the search.
+ */
+ candidate = rxrpc_create_peer(local, srx, hash_key, gfp);
+ if (!candidate) {
+ _leave(" = NULL [nomem]");
+ return NULL;
+ }
+
+ spin_lock(&rxrpc_peer_hash_lock);
+
+ /* Need to check that we aren't racing with someone else */
+ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ if (!peer)
+ hash_add_rcu(rxrpc_peer_hash,
+ &candidate->hash_link, hash_key);
+
+ spin_unlock(&rxrpc_peer_hash_lock);
+
+ if (peer)
+ kfree(candidate);
+ else
+ peer = candidate;
+ }
+
+ _net("PEER %d {%d,%pI4+%hu}",
+ peer->debug_id,
+ peer->srx.transport_type,
+ &peer->srx.transport.sin.sin_addr,
+ ntohs(peer->srx.transport.sin.sin_port));
+
+ _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
+ return peer;
+}
+
+/*
+ * Discard a ref on a remote peer record.
+ */
+void __rxrpc_put_peer(struct rxrpc_peer *peer)
+{
+ ASSERT(hlist_empty(&peer->error_targets));
+
+ spin_lock(&rxrpc_peer_hash_lock);
+ hash_del_rcu(&peer->hash_link);
+ spin_unlock(&rxrpc_peer_hash_lock);
+
+ kfree_rcu(peer, rcu);
+}
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
new file mode 100644
index 000000000..ced5f0744
--- /dev/null
+++ b/net/rxrpc/proc.c
@@ -0,0 +1,192 @@
+/* /proc/net/ support for AF_RXRPC
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = {
+ [RXRPC_CONN_UNUSED] = "Unused ",
+ [RXRPC_CONN_CLIENT] = "Client ",
+ [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ",
+ [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ",
+ [RXRPC_CONN_SERVICE] = "SvSecure",
+ [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort",
+ [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort",
+ [RXRPC_CONN_NETWORK_ERROR] = "NetError",
+};
+
+/*
+ * generate a list of extant and dead calls in /proc/net/rxrpc_calls
+ */
+static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos)
+{
+ read_lock(&rxrpc_call_lock);
+ return seq_list_start_head(&rxrpc_calls, *_pos);
+}
+
+static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &rxrpc_calls, pos);
+}
+
+static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock(&rxrpc_call_lock);
+}
+
+static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
+{
+ struct rxrpc_connection *conn;
+ struct rxrpc_call *call;
+ char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+
+ if (v == &rxrpc_calls) {
+ seq_puts(seq,
+ "Proto Local Remote "
+ " SvID ConnID CallID End Use State Abort "
+ " UserID\n");
+ return 0;
+ }
+
+ call = list_entry(v, struct rxrpc_call, link);
+
+ sprintf(lbuff, "%pI4:%u",
+ &call->local->srx.transport.sin.sin_addr,
+ ntohs(call->local->srx.transport.sin.sin_port));
+
+ conn = call->conn;
+ if (conn)
+ sprintf(rbuff, "%pI4:%u",
+ &conn->params.peer->srx.transport.sin.sin_addr,
+ ntohs(conn->params.peer->srx.transport.sin.sin_port));
+ else
+ strcpy(rbuff, "no_connection");
+
+ seq_printf(seq,
+ "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u"
+ " %-8.8s %08x %lx\n",
+ lbuff,
+ rbuff,
+ call->service_id,
+ call->cid,
+ call->call_id,
+ call->in_clientflag ? "Svc" : "Clt",
+ atomic_read(&call->usage),
+ rxrpc_call_states[call->state],
+ call->remote_abort ?: call->local_abort,
+ call->user_call_ID);
+
+ return 0;
+}
+
+static const struct seq_operations rxrpc_call_seq_ops = {
+ .start = rxrpc_call_seq_start,
+ .next = rxrpc_call_seq_next,
+ .stop = rxrpc_call_seq_stop,
+ .show = rxrpc_call_seq_show,
+};
+
+static int rxrpc_call_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rxrpc_call_seq_ops);
+}
+
+const struct file_operations rxrpc_call_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rxrpc_call_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * generate a list of extant virtual connections in /proc/net/rxrpc_conns
+ */
+static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos)
+{
+ read_lock(&rxrpc_connection_lock);
+ return seq_list_start_head(&rxrpc_connections, *_pos);
+}
+
+static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v,
+ loff_t *pos)
+{
+ return seq_list_next(v, &rxrpc_connections, pos);
+}
+
+static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock(&rxrpc_connection_lock);
+}
+
+static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
+{
+ struct rxrpc_connection *conn;
+ char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+
+ if (v == &rxrpc_connections) {
+ seq_puts(seq,
+ "Proto Local Remote "
+ " SvID ConnID End Use State Key "
+ " Serial ISerial\n"
+ );
+ return 0;
+ }
+
+ conn = list_entry(v, struct rxrpc_connection, link);
+
+ sprintf(lbuff, "%pI4:%u",
+ &conn->params.local->srx.transport.sin.sin_addr,
+ ntohs(conn->params.local->srx.transport.sin.sin_port));
+
+ sprintf(rbuff, "%pI4:%u",
+ &conn->params.peer->srx.transport.sin.sin_addr,
+ ntohs(conn->params.peer->srx.transport.sin.sin_port));
+
+ seq_printf(seq,
+ "UDP %-22.22s %-22.22s %4x %08x %s %3u"
+ " %s %08x %08x %08x\n",
+ lbuff,
+ rbuff,
+ conn->params.service_id,
+ conn->proto.cid,
+ rxrpc_conn_is_service(conn) ? "Svc" : "Clt",
+ atomic_read(&conn->usage),
+ rxrpc_conn_states[conn->state],
+ key_serial(conn->params.key),
+ atomic_read(&conn->serial),
+ atomic_read(&conn->hi_serial));
+
+ return 0;
+}
+
+static const struct seq_operations rxrpc_connection_seq_ops = {
+ .start = rxrpc_connection_seq_start,
+ .next = rxrpc_connection_seq_next,
+ .stop = rxrpc_connection_seq_stop,
+ .show = rxrpc_connection_seq_show,
+};
+
+
+static int rxrpc_connection_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rxrpc_connection_seq_ops);
+}
+
+const struct file_operations rxrpc_connection_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rxrpc_connection_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
new file mode 100644
index 000000000..9ed66d533
--- /dev/null
+++ b/net/rxrpc/recvmsg.c
@@ -0,0 +1,417 @@
+/* RxRPC recvmsg() implementation
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * removal a call's user ID from the socket tree to make the user ID available
+ * again and so that it won't be seen again in association with that call
+ */
+void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call)
+{
+ _debug("RELEASE CALL %d", call->debug_id);
+
+ if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ write_lock_bh(&rx->call_lock);
+ rb_erase(&call->sock_node, &call->socket->calls);
+ clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+ write_unlock_bh(&rx->call_lock);
+ }
+
+ read_lock_bh(&call->state_lock);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * receive a message from an RxRPC socket
+ * - we need to be careful about two or more threads calling recvmsg
+ * simultaneously
+ */
+int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_call *call = NULL, *continue_call = NULL;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ struct sk_buff *skb;
+ long timeo;
+ int copy, ret, ullen, offset, copied = 0;
+ u32 abort_code;
+
+ DEFINE_WAIT(wait);
+
+ _enter(",,,%zu,%d", len, flags);
+
+ if (flags & (MSG_OOB | MSG_TRUNC))
+ return -EOPNOTSUPP;
+
+ ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long);
+
+ timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT);
+ msg->msg_flags |= MSG_MORE;
+
+ lock_sock(&rx->sk);
+
+ for (;;) {
+ /* return immediately if a client socket has no outstanding
+ * calls */
+ if (RB_EMPTY_ROOT(&rx->calls)) {
+ if (copied)
+ goto out;
+ if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
+ release_sock(&rx->sk);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ return -ENODATA;
+ }
+ }
+
+ /* get the next message on the Rx queue */
+ skb = skb_peek(&rx->sk.sk_receive_queue);
+ if (!skb) {
+ /* nothing remains on the queue */
+ if (copied &&
+ (flags & MSG_PEEK || timeo == 0))
+ goto out;
+
+ /* wait for a message to turn up */
+ release_sock(&rx->sk);
+ prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
+ TASK_INTERRUPTIBLE);
+ ret = sock_error(&rx->sk);
+ if (ret)
+ goto wait_error;
+
+ if (skb_queue_empty(&rx->sk.sk_receive_queue)) {
+ if (signal_pending(current))
+ goto wait_interrupted;
+ timeo = schedule_timeout(timeo);
+ }
+ finish_wait(sk_sleep(&rx->sk), &wait);
+ lock_sock(&rx->sk);
+ continue;
+ }
+
+ peek_next_packet:
+ sp = rxrpc_skb(skb);
+ call = sp->call;
+ ASSERT(call != NULL);
+
+ _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]);
+
+ /* make sure we wait for the state to be updated in this call */
+ spin_lock_bh(&call->lock);
+ spin_unlock_bh(&call->lock);
+
+ if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
+ _debug("packet from released call");
+ if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ continue;
+ }
+
+ /* determine whether to continue last data receive */
+ if (continue_call) {
+ _debug("maybe cont");
+ if (call != continue_call ||
+ skb->mark != RXRPC_SKB_MARK_DATA) {
+ release_sock(&rx->sk);
+ rxrpc_put_call(continue_call);
+ _leave(" = %d [noncont]", copied);
+ return copied;
+ }
+ }
+
+ rxrpc_get_call(call);
+
+ /* copy the peer address and timestamp */
+ if (!continue_call) {
+ if (msg->msg_name) {
+ size_t len =
+ sizeof(call->conn->params.peer->srx);
+ memcpy(msg->msg_name,
+ &call->conn->params.peer->srx, len);
+ msg->msg_namelen = len;
+ }
+ sock_recv_timestamp(msg, &rx->sk, skb);
+ }
+
+ /* receive the message */
+ if (skb->mark != RXRPC_SKB_MARK_DATA)
+ goto receive_non_data_message;
+
+ _debug("recvmsg DATA #%u { %d, %d }",
+ sp->hdr.seq, skb->len, sp->offset);
+
+ if (!continue_call) {
+ /* only set the control data once per recvmsg() */
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ ullen, &call->user_call_ID);
+ if (ret < 0)
+ goto copy_error;
+ ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+ }
+
+ ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+ ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+ call->rx_data_recv = sp->hdr.seq;
+
+ ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
+
+ offset = sp->offset;
+ copy = skb->len - offset;
+ if (copy > len - copied)
+ copy = len - copied;
+
+ ret = skb_copy_datagram_msg(skb, offset, msg, copy);
+
+ if (ret < 0)
+ goto copy_error;
+
+ /* handle piecemeal consumption of data packets */
+ _debug("copied %d+%d", copy, copied);
+
+ offset += copy;
+ copied += copy;
+
+ if (!(flags & MSG_PEEK))
+ sp->offset = offset;
+
+ if (sp->offset < skb->len) {
+ _debug("buffer full");
+ ASSERTCMP(copied, ==, len);
+ break;
+ }
+
+ /* we transferred the whole data packet */
+ if (!(flags & MSG_PEEK))
+ rxrpc_kernel_data_consumed(call, skb);
+
+ if (sp->hdr.flags & RXRPC_LAST_PACKET) {
+ _debug("last");
+ if (rxrpc_conn_is_client(call->conn)) {
+ /* last byte of reply received */
+ ret = copied;
+ goto terminal_message;
+ }
+
+ /* last bit of request received */
+ if (!(flags & MSG_PEEK)) {
+ _debug("eat packet");
+ if (skb_dequeue(&rx->sk.sk_receive_queue) !=
+ skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ }
+ msg->msg_flags &= ~MSG_MORE;
+ break;
+ }
+
+ /* move on to the next data message */
+ _debug("next");
+ if (!continue_call)
+ continue_call = sp->call;
+ else
+ rxrpc_put_call(call);
+ call = NULL;
+
+ if (flags & MSG_PEEK) {
+ _debug("peek next");
+ skb = skb->next;
+ if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue)
+ break;
+ goto peek_next_packet;
+ }
+
+ _debug("eat packet");
+ if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ }
+
+ /* end of non-terminal data packet reception for the moment */
+ _debug("end rcv data");
+out:
+ release_sock(&rx->sk);
+ if (call)
+ rxrpc_put_call(call);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ _leave(" = %d [data]", copied);
+ return copied;
+
+ /* handle non-DATA messages such as aborts, incoming connections and
+ * final ACKs */
+receive_non_data_message:
+ _debug("non-data");
+
+ if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) {
+ _debug("RECV NEW CALL");
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code);
+ if (ret < 0)
+ goto copy_error;
+ if (!(flags & MSG_PEEK)) {
+ if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ }
+ goto out;
+ }
+
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ ullen, &call->user_call_ID);
+ if (ret < 0)
+ goto copy_error;
+ ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+
+ switch (skb->mark) {
+ case RXRPC_SKB_MARK_DATA:
+ BUG();
+ case RXRPC_SKB_MARK_FINAL_ACK:
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_BUSY:
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_REMOTE_ABORT:
+ abort_code = call->remote_abort;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_LOCAL_ABORT:
+ abort_code = call->local_abort;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_NET_ERROR:
+ _debug("RECV NET ERROR %d", sp->error);
+ abort_code = sp->error;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_LOCAL_ERROR:
+ _debug("RECV LOCAL ERROR %d", sp->error);
+ abort_code = sp->error;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4,
+ &abort_code);
+ break;
+ default:
+ pr_err("Unknown packet mark %u\n", skb->mark);
+ BUG();
+ break;
+ }
+
+ if (ret < 0)
+ goto copy_error;
+
+terminal_message:
+ _debug("terminal");
+ msg->msg_flags &= ~MSG_MORE;
+ msg->msg_flags |= MSG_EOR;
+
+ if (!(flags & MSG_PEEK)) {
+ _net("free terminal skb %p", skb);
+ if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ rxrpc_remove_user_ID(rx, call);
+ }
+
+ release_sock(&rx->sk);
+ rxrpc_put_call(call);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ _leave(" = %d", ret);
+ return ret;
+
+copy_error:
+ _debug("copy error");
+ release_sock(&rx->sk);
+ rxrpc_put_call(call);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ _leave(" = %d", ret);
+ return ret;
+
+wait_interrupted:
+ ret = sock_intr_errno(timeo);
+wait_error:
+ finish_wait(sk_sleep(&rx->sk), &wait);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ if (copied)
+ copied = ret;
+ _leave(" = %d [waitfail %d]", copied, ret);
+ return copied;
+
+}
+
+/**
+ * rxrpc_kernel_is_data_last - Determine if data message is last one
+ * @skb: Message holding data
+ *
+ * Determine if data message is last one for the parent call.
+ */
+bool rxrpc_kernel_is_data_last(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA);
+
+ return sp->hdr.flags & RXRPC_LAST_PACKET;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_is_data_last);
+
+/**
+ * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message
+ * @skb: Message indicating an abort
+ *
+ * Get the abort code from an RxRPC abort message.
+ */
+u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ switch (skb->mark) {
+ case RXRPC_SKB_MARK_REMOTE_ABORT:
+ return sp->call->remote_abort;
+ case RXRPC_SKB_MARK_LOCAL_ABORT:
+ return sp->call->local_abort;
+ default:
+ BUG();
+ }
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_get_abort_code);
+
+/**
+ * rxrpc_kernel_get_error - Get the error number from an RxRPC error message
+ * @skb: Message indicating an error
+ *
+ * Get the error number from an RxRPC error message.
+ */
+int rxrpc_kernel_get_error_number(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ return sp->error;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_get_error_number);
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index bab56ed64..63afa9e9c 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -9,6 +9,8 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <crypto/skcipher.h>
#include <linux/module.h>
#include <linux/net.h>
@@ -56,9 +58,9 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
struct rxrpc_key_token *token;
int ret;
- _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key));
+ _enter("{%d},{%x}", conn->debug_id, key_serial(conn->params.key));
- token = conn->key->payload.data[0];
+ token = conn->params.key->payload.data[0];
conn->security_ix = token->security_index;
ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
@@ -72,7 +74,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
sizeof(token->kad->session_key)) < 0)
BUG();
- switch (conn->security_level) {
+ switch (conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
break;
case RXRPC_SECURITY_AUTH:
@@ -101,43 +103,43 @@ error:
* prime the encryption state with the invariant parts of a connection's
* description
*/
-static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
+static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
{
struct rxrpc_key_token *token;
SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
- struct scatterlist sg[2];
+ struct scatterlist sg;
struct rxrpc_crypt iv;
- struct {
- __be32 x[4];
- } tmpbuf __attribute__((aligned(16))); /* must all be in same page */
+ __be32 *tmpbuf;
+ size_t tmpsize = 4 * sizeof(__be32);
_enter("");
- if (!conn->key)
- return;
+ if (!conn->params.key)
+ return 0;
- token = conn->key->payload.data[0];
- memcpy(&iv, token->kad->session_key, sizeof(iv));
+ tmpbuf = kmalloc(tmpsize, GFP_KERNEL);
+ if (!tmpbuf)
+ return -ENOMEM;
- tmpbuf.x[0] = htonl(conn->epoch);
- tmpbuf.x[1] = htonl(conn->cid);
- tmpbuf.x[2] = 0;
- tmpbuf.x[3] = htonl(conn->security_ix);
+ token = conn->params.key->payload.data[0];
+ memcpy(&iv, token->kad->session_key, sizeof(iv));
- sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
- sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+ tmpbuf[0] = htonl(conn->proto.epoch);
+ tmpbuf[1] = htonl(conn->proto.cid);
+ tmpbuf[2] = 0;
+ tmpbuf[3] = htonl(conn->security_ix);
+ sg_init_one(&sg, tmpbuf, tmpsize);
skcipher_request_set_tfm(req, conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
-
+ skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x);
crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
- memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv));
- ASSERTCMP((u32 __force)conn->csum_iv.n[0], ==, (u32 __force)tmpbuf.x[2]);
-
- _leave("");
+ memcpy(&conn->csum_iv, tmpbuf + 2, sizeof(conn->csum_iv));
+ kfree(tmpbuf);
+ _leave(" = 0");
+ return 0;
}
/*
@@ -150,12 +152,9 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
{
struct rxrpc_skb_priv *sp;
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
+ struct rxkad_level1_hdr hdr;
struct rxrpc_crypt iv;
- struct scatterlist sg[2];
- struct {
- struct rxkad_level1_hdr hdr;
- __be32 first; /* first four bytes of data and padding */
- } tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+ struct scatterlist sg;
u16 check;
sp = rxrpc_skb(skb);
@@ -165,24 +164,19 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
check = sp->hdr.seq ^ sp->hdr.callNumber;
data_size |= (u32)check << 16;
- tmpbuf.hdr.data_size = htonl(data_size);
- memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first));
+ hdr.data_size = htonl(data_size);
+ memcpy(sechdr, &hdr, sizeof(hdr));
/* start the encryption afresh */
memset(&iv, 0, sizeof(iv));
- sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
- sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
-
+ sg_init_one(&sg, sechdr, 8);
skcipher_request_set_tfm(req, call->conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
-
+ skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
- memcpy(sechdr, &tmpbuf, sizeof(tmpbuf));
-
_leave(" = 0");
return 0;
}
@@ -196,8 +190,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
void *sechdr)
{
const struct rxrpc_key_token *token;
- struct rxkad_level2_hdr rxkhdr
- __attribute__((aligned(8))); /* must be all on one page */
+ struct rxkad_level2_hdr rxkhdr;
struct rxrpc_skb_priv *sp;
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
@@ -216,18 +209,16 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
rxkhdr.data_size = htonl(data_size | (u32)check << 16);
rxkhdr.checksum = 0;
+ memcpy(sechdr, &rxkhdr, sizeof(rxkhdr));
/* encrypt from the session key */
- token = call->conn->key->payload.data[0];
+ token = call->conn->params.key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
sg_init_one(&sg[0], sechdr, sizeof(rxkhdr));
- sg_init_one(&sg[1], &rxkhdr, sizeof(rxkhdr));
-
skcipher_request_set_tfm(req, call->conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(rxkhdr), iv.x);
-
+ skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x);
crypto_skcipher_encrypt(req);
/* we want to encrypt the skbuff in-place */
@@ -241,9 +232,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
sg_init_table(sg, nsg);
skb_to_sgvec(skb, sg, 0, len);
-
skcipher_request_set_crypt(req, sg, sg, len, iv.x);
-
crypto_skcipher_encrypt(req);
_leave(" = 0");
@@ -257,7 +246,7 @@ out:
/*
* checksum an RxRPC packet header
*/
-static int rxkad_secure_packet(const struct rxrpc_call *call,
+static int rxkad_secure_packet(struct rxrpc_call *call,
struct sk_buff *skb,
size_t data_size,
void *sechdr)
@@ -265,23 +254,20 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
struct rxrpc_skb_priv *sp;
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
- struct scatterlist sg[2];
- struct {
- __be32 x[2];
- } tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+ struct scatterlist sg;
u32 x, y;
int ret;
sp = rxrpc_skb(skb);
_enter("{%d{%x}},{#%u},%zu,",
- call->debug_id, key_serial(call->conn->key), sp->hdr.seq,
- data_size);
+ call->debug_id, key_serial(call->conn->params.key),
+ sp->hdr.seq, data_size);
if (!call->conn->cipher)
return 0;
- ret = key_validate(call->conn->key);
+ ret = key_validate(call->conn->params.key);
if (ret < 0)
return ret;
@@ -291,26 +277,23 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
/* calculate the security checksum */
x = call->channel << (32 - RXRPC_CIDSHIFT);
x |= sp->hdr.seq & 0x3fffffff;
- tmpbuf.x[0] = htonl(sp->hdr.callNumber);
- tmpbuf.x[1] = htonl(x);
-
- sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
- sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+ call->crypto_buf[0] = htonl(sp->hdr.callNumber);
+ call->crypto_buf[1] = htonl(x);
+ sg_init_one(&sg, call->crypto_buf, 8);
skcipher_request_set_tfm(req, call->conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
-
+ skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
- y = ntohl(tmpbuf.x[1]);
+ y = ntohl(call->crypto_buf[1]);
y = (y >> 16) & 0xffff;
if (y == 0)
y = 1; /* zero checksums are not permitted */
sp->hdr.cksum = y;
- switch (call->conn->security_level) {
+ switch (call->conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
ret = 0;
break;
@@ -365,7 +348,6 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
skcipher_request_set_tfm(req, call->conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, 8, iv.x);
-
crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
@@ -444,13 +426,12 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
skb_to_sgvec(skb, sg, 0, skb->len);
/* decrypt from the session key */
- token = call->conn->key->payload.data[0];
+ token = call->conn->params.key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
skcipher_request_set_tfm(req, call->conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x);
-
crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
if (sg != _sg)
@@ -496,17 +477,14 @@ nomem:
/*
* verify the security on a received packet
*/
-static int rxkad_verify_packet(const struct rxrpc_call *call,
+static int rxkad_verify_packet(struct rxrpc_call *call,
struct sk_buff *skb,
u32 *_abort_code)
{
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_skb_priv *sp;
struct rxrpc_crypt iv;
- struct scatterlist sg[2];
- struct {
- __be32 x[2];
- } tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+ struct scatterlist sg;
u16 cksum;
u32 x, y;
int ret;
@@ -514,7 +492,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
sp = rxrpc_skb(skb);
_enter("{%d{%x}},{#%u}",
- call->debug_id, key_serial(call->conn->key), sp->hdr.seq);
+ call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq);
if (!call->conn->cipher)
return 0;
@@ -531,20 +509,17 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
/* validate the security checksum */
x = call->channel << (32 - RXRPC_CIDSHIFT);
x |= sp->hdr.seq & 0x3fffffff;
- tmpbuf.x[0] = htonl(call->call_id);
- tmpbuf.x[1] = htonl(x);
-
- sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
- sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+ call->crypto_buf[0] = htonl(call->call_id);
+ call->crypto_buf[1] = htonl(x);
+ sg_init_one(&sg, call->crypto_buf, 8);
skcipher_request_set_tfm(req, call->conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
-
+ skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
- y = ntohl(tmpbuf.x[1]);
+ y = ntohl(call->crypto_buf[1]);
cksum = (y >> 16) & 0xffff;
if (cksum == 0)
cksum = 1; /* zero checksums are not permitted */
@@ -555,7 +530,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
return -EPROTO;
}
- switch (call->conn->security_level) {
+ switch (call->conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
ret = 0;
break;
@@ -587,9 +562,9 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
u32 serial;
int ret;
- _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key));
- ret = key_validate(conn->key);
+ ret = key_validate(conn->params.key);
if (ret < 0)
return ret;
@@ -600,14 +575,14 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
challenge.min_level = htonl(0);
challenge.__padding = 0;
- msg.msg_name = &conn->trans->peer->srx.transport.sin;
- msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin);
+ msg.msg_name = &conn->params.peer->srx.transport.sin;
+ msg.msg_namelen = sizeof(conn->params.peer->srx.transport.sin);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
- whdr.epoch = htonl(conn->epoch);
- whdr.cid = htonl(conn->cid);
+ whdr.epoch = htonl(conn->proto.epoch);
+ whdr.cid = htonl(conn->proto.cid);
whdr.callNumber = 0;
whdr.seq = 0;
whdr.type = RXRPC_PACKET_TYPE_CHALLENGE;
@@ -615,7 +590,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
whdr.userStatus = 0;
whdr.securityIndex = conn->security_ix;
whdr._rsvd = 0;
- whdr.serviceId = htons(conn->service_id);
+ whdr.serviceId = htons(conn->params.service_id);
iov[0].iov_base = &whdr;
iov[0].iov_len = sizeof(whdr);
@@ -628,7 +603,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
whdr.serial = htonl(serial);
_proto("Tx CHALLENGE %%%u", serial);
- ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
if (ret < 0) {
_debug("sendmsg failed: %d", ret);
return -EAGAIN;
@@ -655,8 +630,8 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
_enter("");
- msg.msg_name = &conn->trans->peer->srx.transport.sin;
- msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin);
+ msg.msg_name = &conn->params.peer->srx.transport.sin;
+ msg.msg_namelen = sizeof(conn->params.peer->srx.transport.sin);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -682,7 +657,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
whdr.serial = htonl(serial);
_proto("Tx RESPONSE %%%u", serial);
- ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len);
if (ret < 0) {
_debug("sendmsg failed: %d", ret);
return -EAGAIN;
@@ -708,29 +683,6 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response)
}
/*
- * load a scatterlist with a potentially split-page buffer
- */
-static void rxkad_sg_set_buf2(struct scatterlist sg[2],
- void *buf, size_t buflen)
-{
- int nsg = 1;
-
- sg_init_table(sg, 2);
-
- sg_set_buf(&sg[0], buf, buflen);
- if (sg[0].offset + buflen > PAGE_SIZE) {
- /* the buffer was split over two pages */
- sg[0].length = PAGE_SIZE - sg[0].offset;
- sg_set_buf(&sg[1], buf + sg[0].length, buflen - sg[0].length);
- nsg++;
- }
-
- sg_mark_end(&sg[nsg - 1]);
-
- ASSERTCMP(sg[0].length + sg[1].length, ==, buflen);
-}
-
-/*
* encrypt the response packet
*/
static void rxkad_encrypt_response(struct rxrpc_connection *conn,
@@ -739,17 +691,16 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn,
{
SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
struct rxrpc_crypt iv;
- struct scatterlist sg[2];
+ struct scatterlist sg[1];
/* continue encrypting from where we left off */
memcpy(&iv, s2->session_key, sizeof(iv));
- rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
-
+ sg_init_table(sg, 1);
+ sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted));
skcipher_request_set_tfm(req, conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
-
crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
}
@@ -769,14 +720,14 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
u32 version, nonce, min_level, abort_code;
int ret;
- _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key));
- if (!conn->key) {
+ if (!conn->params.key) {
_leave(" = -EPROTO [no key]");
return -EPROTO;
}
- ret = key_validate(conn->key);
+ ret = key_validate(conn->params.key);
if (ret < 0) {
*_abort_code = RXKADEXPIRED;
return ret;
@@ -799,31 +750,27 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
goto protocol_error;
abort_code = RXKADLEVELFAIL;
- if (conn->security_level < min_level)
+ if (conn->params.security_level < min_level)
goto protocol_error;
- token = conn->key->payload.data[0];
+ token = conn->params.key->payload.data[0];
/* build the response packet */
memset(&resp, 0, sizeof(resp));
resp.version = htonl(RXKAD_VERSION);
- resp.encrypted.epoch = htonl(conn->epoch);
- resp.encrypted.cid = htonl(conn->cid);
+ resp.encrypted.epoch = htonl(conn->proto.epoch);
+ resp.encrypted.cid = htonl(conn->proto.cid);
resp.encrypted.securityIndex = htonl(conn->security_ix);
resp.encrypted.inc_nonce = htonl(nonce + 1);
- resp.encrypted.level = htonl(conn->security_level);
+ resp.encrypted.level = htonl(conn->params.security_level);
resp.kvno = htonl(token->kad->kvno);
resp.ticket_len = htonl(token->kad->ticket_len);
- resp.encrypted.call_id[0] =
- htonl(conn->channels[0] ? conn->channels[0]->call_id : 0);
- resp.encrypted.call_id[1] =
- htonl(conn->channels[1] ? conn->channels[1]->call_id : 0);
- resp.encrypted.call_id[2] =
- htonl(conn->channels[2] ? conn->channels[2]->call_id : 0);
- resp.encrypted.call_id[3] =
- htonl(conn->channels[3] ? conn->channels[3]->call_id : 0);
+ resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter);
+ resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter);
+ resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter);
+ resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter);
/* calculate the response checksum and then do the encryption */
rxkad_calc_response_checksum(&resp);
@@ -885,10 +832,8 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
}
sg_init_one(&sg[0], ticket, ticket_len);
-
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, ticket_len, iv.x);
-
crypto_skcipher_decrypt(req);
skcipher_request_free(req);
@@ -999,7 +944,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
const struct rxrpc_crypt *session_key)
{
SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci);
- struct scatterlist sg[2];
+ struct scatterlist sg[1];
struct rxrpc_crypt iv;
_enter(",,%08x%08x",
@@ -1014,12 +959,11 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
memcpy(&iv, session_key, sizeof(iv));
- rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
-
+ sg_init_table(sg, 1);
+ sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted));
skcipher_request_set_tfm(req, rxkad_ci);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
-
crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
@@ -1043,7 +987,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
void *ticket;
u32 abort_code, version, kvno, ticket_len, level;
__be32 csum;
- int ret;
+ int ret, i;
_enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
@@ -1094,9 +1038,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
rxkad_decrypt_response(conn, &response, &session_key);
abort_code = RXKADSEALEDINCON;
- if (ntohl(response.encrypted.epoch) != conn->epoch)
+ if (ntohl(response.encrypted.epoch) != conn->proto.epoch)
goto protocol_error_free;
- if (ntohl(response.encrypted.cid) != conn->cid)
+ if (ntohl(response.encrypted.cid) != conn->proto.cid)
goto protocol_error_free;
if (ntohl(response.encrypted.securityIndex) != conn->security_ix)
goto protocol_error_free;
@@ -1106,11 +1050,26 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
if (response.encrypted.checksum != csum)
goto protocol_error_free;
- if (ntohl(response.encrypted.call_id[0]) > INT_MAX ||
- ntohl(response.encrypted.call_id[1]) > INT_MAX ||
- ntohl(response.encrypted.call_id[2]) > INT_MAX ||
- ntohl(response.encrypted.call_id[3]) > INT_MAX)
- goto protocol_error_free;
+ spin_lock(&conn->channel_lock);
+ for (i = 0; i < RXRPC_MAXCALLS; i++) {
+ struct rxrpc_call *call;
+ u32 call_id = ntohl(response.encrypted.call_id[i]);
+
+ if (call_id > INT_MAX)
+ goto protocol_error_unlock;
+
+ if (call_id < conn->channels[i].call_counter)
+ goto protocol_error_unlock;
+ if (call_id > conn->channels[i].call_counter) {
+ call = rcu_dereference_protected(
+ conn->channels[i].call,
+ lockdep_is_held(&conn->channel_lock));
+ if (call && call->state < RXRPC_CALL_COMPLETE)
+ goto protocol_error_unlock;
+ conn->channels[i].call_counter = call_id;
+ }
+ }
+ spin_unlock(&conn->channel_lock);
abort_code = RXKADOUTOFSEQUENCE;
if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1)
@@ -1120,7 +1079,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
level = ntohl(response.encrypted.level);
if (level > RXRPC_SECURITY_ENCRYPT)
goto protocol_error_free;
- conn->security_level = level;
+ conn->params.security_level = level;
/* create a key to hold the security data and expiration time - after
* this the connection security can be handled in exactly the same way
@@ -1135,6 +1094,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
_leave(" = 0");
return 0;
+protocol_error_unlock:
+ spin_unlock(&conn->channel_lock);
protocol_error_free:
kfree(ticket);
protocol_error:
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
new file mode 100644
index 000000000..814d285ff
--- /dev/null
+++ b/net/rxrpc/security.c
@@ -0,0 +1,168 @@
+/* RxRPC security handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/crypto.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
+#include "ar-internal.h"
+
+static LIST_HEAD(rxrpc_security_methods);
+static DECLARE_RWSEM(rxrpc_security_sem);
+
+static const struct rxrpc_security *rxrpc_security_types[] = {
+ [RXRPC_SECURITY_NONE] = &rxrpc_no_security,
+#ifdef CONFIG_RXKAD
+ [RXRPC_SECURITY_RXKAD] = &rxkad,
+#endif
+};
+
+int __init rxrpc_init_security(void)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) {
+ if (rxrpc_security_types[i]) {
+ ret = rxrpc_security_types[i]->init();
+ if (ret < 0)
+ goto failed;
+ }
+ }
+
+ return 0;
+
+failed:
+ for (i--; i >= 0; i--)
+ if (rxrpc_security_types[i])
+ rxrpc_security_types[i]->exit();
+ return ret;
+}
+
+void rxrpc_exit_security(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++)
+ if (rxrpc_security_types[i])
+ rxrpc_security_types[i]->exit();
+}
+
+/*
+ * look up an rxrpc security module
+ */
+static const struct rxrpc_security *rxrpc_security_lookup(u8 security_index)
+{
+ if (security_index >= ARRAY_SIZE(rxrpc_security_types))
+ return NULL;
+ return rxrpc_security_types[security_index];
+}
+
+/*
+ * initialise the security on a client connection
+ */
+int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
+{
+ const struct rxrpc_security *sec;
+ struct rxrpc_key_token *token;
+ struct key *key = conn->params.key;
+ int ret;
+
+ _enter("{%d},{%x}", conn->debug_id, key_serial(key));
+
+ if (!key)
+ return 0;
+
+ ret = key_validate(key);
+ if (ret < 0)
+ return ret;
+
+ token = key->payload.data[0];
+ if (!token)
+ return -EKEYREJECTED;
+
+ sec = rxrpc_security_lookup(token->security_index);
+ if (!sec)
+ return -EKEYREJECTED;
+ conn->security = sec;
+
+ ret = conn->security->init_connection_security(conn);
+ if (ret < 0) {
+ conn->security = &rxrpc_no_security;
+ return ret;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * initialise the security on a server connection
+ */
+int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
+{
+ const struct rxrpc_security *sec;
+ struct rxrpc_local *local = conn->params.local;
+ struct rxrpc_sock *rx;
+ struct key *key;
+ key_ref_t kref;
+ char kdesc[5 + 1 + 3 + 1];
+
+ _enter("");
+
+ sprintf(kdesc, "%u:%u", conn->params.service_id, conn->security_ix);
+
+ sec = rxrpc_security_lookup(conn->security_ix);
+ if (!sec) {
+ _leave(" = -ENOKEY [lookup]");
+ return -ENOKEY;
+ }
+
+ /* find the service */
+ read_lock_bh(&local->services_lock);
+ list_for_each_entry(rx, &local->services, listen_link) {
+ if (rx->srx.srx_service == conn->params.service_id)
+ goto found_service;
+ }
+
+ /* the service appears to have died */
+ read_unlock_bh(&local->services_lock);
+ _leave(" = -ENOENT");
+ return -ENOENT;
+
+found_service:
+ if (!rx->securities) {
+ read_unlock_bh(&local->services_lock);
+ _leave(" = -ENOKEY");
+ return -ENOKEY;
+ }
+
+ /* look through the service's keyring */
+ kref = keyring_search(make_key_ref(rx->securities, 1UL),
+ &key_type_rxrpc_s, kdesc);
+ if (IS_ERR(kref)) {
+ read_unlock_bh(&local->services_lock);
+ _leave(" = %ld [search]", PTR_ERR(kref));
+ return PTR_ERR(kref);
+ }
+
+ key = key_ref_to_ptr(kref);
+ read_unlock_bh(&local->services_lock);
+
+ conn->server_key = key;
+ conn->security = sec;
+
+ _leave(" = 0");
+ return 0;
+}
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
new file mode 100644
index 000000000..06c51d4b6
--- /dev/null
+++ b/net/rxrpc/skbuff.c
@@ -0,0 +1,165 @@
+/* ar-skbuff.c: socket buffer destruction handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * set up for the ACK at the end of the receive phase when we discard the final
+ * receive phase data packet
+ * - called with softirqs disabled
+ */
+static void rxrpc_request_final_ACK(struct rxrpc_call *call)
+{
+ /* the call may be aborted before we have a chance to ACK it */
+ write_lock(&call->state_lock);
+
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_RECV_REPLY:
+ call->state = RXRPC_CALL_CLIENT_FINAL_ACK;
+ _debug("request final ACK");
+
+ /* get an extra ref on the call for the final-ACK generator to
+ * release */
+ rxrpc_get_call(call);
+ set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
+ if (try_to_del_timer_sync(&call->ack_timer) >= 0)
+ rxrpc_queue_call(call);
+ break;
+
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
+ default:
+ break;
+ }
+
+ write_unlock(&call->state_lock);
+}
+
+/*
+ * drop the bottom ACK off of the call ACK window and advance the window
+ */
+static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
+ struct rxrpc_skb_priv *sp)
+{
+ int loop;
+ u32 seq;
+
+ spin_lock_bh(&call->lock);
+
+ _debug("hard ACK #%u", sp->hdr.seq);
+
+ for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
+ call->ackr_window[loop] >>= 1;
+ call->ackr_window[loop] |=
+ call->ackr_window[loop + 1] << (BITS_PER_LONG - 1);
+ }
+
+ seq = sp->hdr.seq;
+ ASSERTCMP(seq, ==, call->rx_data_eaten + 1);
+ call->rx_data_eaten = seq;
+
+ if (call->ackr_win_top < UINT_MAX)
+ call->ackr_win_top++;
+
+ ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
+ call->rx_data_post, >=, call->rx_data_recv);
+ ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
+ call->rx_data_recv, >=, call->rx_data_eaten);
+
+ if (sp->hdr.flags & RXRPC_LAST_PACKET) {
+ rxrpc_request_final_ACK(call);
+ } else if (atomic_dec_and_test(&call->ackr_not_idle) &&
+ test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) {
+ /* We previously soft-ACK'd some received packets that have now
+ * been consumed, so send a hard-ACK if no more packets are
+ * immediately forthcoming to allow the transmitter to free up
+ * its Tx bufferage.
+ */
+ _debug("send Rx idle ACK");
+ __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial,
+ false);
+ }
+
+ spin_unlock_bh(&call->lock);
+}
+
+/**
+ * rxrpc_kernel_data_consumed - Record consumption of data message
+ * @call: The call to which the message pertains.
+ * @skb: Message holding data
+ *
+ * Record the consumption of a data message and generate an ACK if appropriate.
+ * The call state is shifted if this was the final packet. The caller must be
+ * in process context with no spinlocks held.
+ *
+ * TODO: Actually generate the ACK here rather than punting this to the
+ * workqueue.
+ */
+void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq);
+
+ ASSERTCMP(sp->call, ==, call);
+ ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA);
+
+ /* TODO: Fix the sequence number tracking */
+ ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+ ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+ ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
+
+ call->rx_data_recv = sp->hdr.seq;
+ rxrpc_hard_ACK_data(call, sp);
+}
+EXPORT_SYMBOL(rxrpc_kernel_data_consumed);
+
+/*
+ * Destroy a packet that has an RxRPC control buffer
+ */
+void rxrpc_packet_destructor(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_call *call = sp->call;
+
+ _enter("%p{%p}", skb, call);
+
+ if (call) {
+ if (atomic_dec_return(&call->skb_count) < 0)
+ BUG();
+ rxrpc_put_call(call);
+ sp->call = NULL;
+ }
+
+ if (skb->sk)
+ sock_rfree(skb);
+ _leave("");
+}
+
+/**
+ * rxrpc_kernel_free_skb - Free an RxRPC socket buffer
+ * @skb: The socket buffer to be freed
+ *
+ * Let RxRPC free its own socket buffer, permitting it to maintain debug
+ * accounting.
+ */
+void rxrpc_kernel_free_skb(struct sk_buff *skb)
+{
+ rxrpc_free_skb(skb);
+}
+EXPORT_SYMBOL(rxrpc_kernel_free_skb);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index d20ed575a..03ad08774 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -18,6 +18,7 @@ static struct ctl_table_header *rxrpc_sysctl_reg_table;
static const unsigned int zero = 0;
static const unsigned int one = 1;
static const unsigned int four = 4;
+static const unsigned int thirtytwo = 32;
static const unsigned int n_65535 = 65535;
static const unsigned int n_max_acks = RXRPC_MAXACKS;
@@ -89,16 +90,17 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)&one,
},
+
+ /* Non-time values */
{
- .procname = "transport_expiry",
- .data = &rxrpc_transport_expiry,
+ .procname = "max_backlog",
+ .data = &rxrpc_max_backlog,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)&one,
+ .extra1 = (void *)&four,
+ .extra2 = (void *)&thirtytwo,
},
-
- /* Non-time values */
{
.procname = "rx_window_size",
.data = &rxrpc_rx_window_size,
diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c
new file mode 100644
index 000000000..b88914d53
--- /dev/null
+++ b/net/rxrpc/utils.c
@@ -0,0 +1,46 @@
+/* Utility routines
+ *
+ * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include "ar-internal.h"
+
+/*
+ * Fill out a peer address from a socket buffer containing a packet.
+ */
+int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb)
+{
+ memset(srx, 0, sizeof(*srx));
+
+ switch (ntohs(skb->protocol)) {
+ case ETH_P_IP:
+ srx->transport_type = SOCK_DGRAM;
+ srx->transport_len = sizeof(srx->transport.sin);
+ srx->transport.sin.sin_family = AF_INET;
+ srx->transport.sin.sin_port = udp_hdr(skb)->source;
+ srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+ return 0;
+
+ case ETH_P_IPV6:
+ srx->transport_type = SOCK_DGRAM;
+ srx->transport_len = sizeof(srx->transport.sin6);
+ srx->transport.sin6.sin6_family = AF_INET6;
+ srx->transport.sin6.sin6_port = udp_hdr(skb)->source;
+ srx->transport.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
+ return 0;
+
+ default:
+ pr_warn_ratelimited("AF_RXRPC: Unknown eth protocol %u\n",
+ ntohs(skb->protocol));
+ return -EAFNOSUPPORT;
+ }
+}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b148302bb..ccf931b3b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -494,6 +494,16 @@ config NET_CLS_FLOWER
To compile this code as a module, choose M here: the module will
be called cls_flower.
+config NET_CLS_MATCHALL
+ tristate "Match-all classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ nothing. Every packet will match.
+
+ To compile this code as a module, choose M here: the module will
+ be called cls_matchall.
+
config NET_EMATCH
bool "Extended Matches"
select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 84bddb373..ae088a5a9 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o
obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o
+obj-$(CONFIG_NET_CLS_MATCHALL) += cls_matchall.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index c7a0b0d48..d09d06875 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -29,45 +29,42 @@
static void free_tcf(struct rcu_head *head)
{
- struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu);
+ struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu);
free_percpu(p->cpu_bstats);
free_percpu(p->cpu_qstats);
kfree(p);
}
-static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a)
+static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
{
- struct tcf_common *p = a->priv;
-
spin_lock_bh(&hinfo->lock);
- hlist_del(&p->tcfc_head);
+ hlist_del(&p->tcfa_head);
spin_unlock_bh(&hinfo->lock);
- gen_kill_estimator(&p->tcfc_bstats,
- &p->tcfc_rate_est);
+ gen_kill_estimator(&p->tcfa_bstats,
+ &p->tcfa_rate_est);
/*
- * gen_estimator est_timer() might access p->tcfc_lock
+ * gen_estimator est_timer() might access p->tcfa_lock
* or bstats, wait a RCU grace period before freeing p
*/
- call_rcu(&p->tcfc_rcu, free_tcf);
+ call_rcu(&p->tcfa_rcu, free_tcf);
}
-int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
+int __tcf_hash_release(struct tc_action *p, bool bind, bool strict)
{
- struct tcf_common *p = a->priv;
int ret = 0;
if (p) {
if (bind)
- p->tcfc_bindcnt--;
- else if (strict && p->tcfc_bindcnt > 0)
+ p->tcfa_bindcnt--;
+ else if (strict && p->tcfa_bindcnt > 0)
return -EPERM;
- p->tcfc_refcnt--;
- if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
- if (a->ops->cleanup)
- a->ops->cleanup(a, bind);
- tcf_hash_destroy(a->hinfo, a);
+ p->tcfa_refcnt--;
+ if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
+ if (p->ops->cleanup)
+ p->ops->cleanup(p, bind);
+ tcf_hash_destroy(p->hinfo, p);
ret = ACT_P_DELETED;
}
}
@@ -77,10 +74,8 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
EXPORT_SYMBOL(__tcf_hash_release);
static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
- struct netlink_callback *cb, struct tc_action *a)
+ struct netlink_callback *cb)
{
- struct hlist_head *head;
- struct tcf_common *p;
int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
struct nlattr *nest;
@@ -89,19 +84,20 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
s_i = cb->args[0];
for (i = 0; i < (hinfo->hmask + 1); i++) {
+ struct hlist_head *head;
+ struct tc_action *p;
+
head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
- hlist_for_each_entry_rcu(p, head, tcfc_head) {
+ hlist_for_each_entry_rcu(p, head, tcfa_head) {
index++;
if (index < s_i)
continue;
- a->priv = p;
- a->order = n_i;
- nest = nla_nest_start(skb, a->order);
+ nest = nla_nest_start(skb, n_i);
if (nest == NULL)
goto nla_put_failure;
- err = tcf_action_dump_1(skb, a, 0, 0);
+ err = tcf_action_dump_1(skb, p, 0, 0);
if (err < 0) {
index--;
nlmsg_trim(skb, nest);
@@ -125,27 +121,27 @@ nla_put_failure:
}
static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
- struct hlist_head *head;
- struct hlist_node *n;
- struct tcf_common *p;
struct nlattr *nest;
int i = 0, n_i = 0;
int ret = -EINVAL;
- nest = nla_nest_start(skb, a->order);
+ nest = nla_nest_start(skb, 0);
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+ if (nla_put_string(skb, TCA_KIND, ops->kind))
goto nla_put_failure;
for (i = 0; i < (hinfo->hmask + 1); i++) {
+ struct hlist_head *head;
+ struct hlist_node *n;
+ struct tc_action *p;
+
head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
- hlist_for_each_entry_safe(p, n, head, tcfc_head) {
- a->priv = p;
- ret = __tcf_hash_release(a, false, true);
+ hlist_for_each_entry_safe(p, n, head, tcfa_head) {
+ ret = __tcf_hash_release(p, false, true);
if (ret == ACT_P_DELETED) {
- module_put(a->ops->owner);
+ module_put(p->ops->owner);
n_i++;
} else if (ret < 0)
goto nla_put_failure;
@@ -163,16 +159,14 @@ nla_put_failure:
int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tcf_hashinfo *hinfo = tn->hinfo;
- a->hinfo = hinfo;
-
if (type == RTM_DELACTION) {
- return tcf_del_walker(hinfo, skb, a);
+ return tcf_del_walker(hinfo, skb, ops);
} else if (type == RTM_GETACTION) {
- return tcf_dump_walker(hinfo, skb, cb, a);
+ return tcf_dump_walker(hinfo, skb, cb);
} else {
WARN(1, "tcf_generic_walker: unknown action %d\n", type);
return -EINVAL;
@@ -180,15 +174,15 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
}
EXPORT_SYMBOL(tcf_generic_walker);
-static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
+static struct tc_action *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
{
- struct tcf_common *p = NULL;
+ struct tc_action *p = NULL;
struct hlist_head *head;
spin_lock_bh(&hinfo->lock);
head = &hinfo->htab[tcf_hash(index, hinfo->hmask)];
- hlist_for_each_entry_rcu(p, head, tcfc_head)
- if (p->tcfc_index == index)
+ hlist_for_each_entry_rcu(p, head, tcfa_head)
+ if (p->tcfa_index == index)
break;
spin_unlock_bh(&hinfo->lock);
@@ -210,59 +204,58 @@ u32 tcf_hash_new_index(struct tc_action_net *tn)
}
EXPORT_SYMBOL(tcf_hash_new_index);
-int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index)
+int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
{
struct tcf_hashinfo *hinfo = tn->hinfo;
- struct tcf_common *p = tcf_hash_lookup(index, hinfo);
+ struct tc_action *p = tcf_hash_lookup(index, hinfo);
if (p) {
- a->priv = p;
- a->hinfo = hinfo;
+ *a = p;
return 1;
}
return 0;
}
EXPORT_SYMBOL(tcf_hash_search);
-int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
- int bind)
+bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
+ int bind)
{
struct tcf_hashinfo *hinfo = tn->hinfo;
- struct tcf_common *p = NULL;
+ struct tc_action *p = NULL;
+
if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {
if (bind)
- p->tcfc_bindcnt++;
- p->tcfc_refcnt++;
- a->priv = p;
- a->hinfo = hinfo;
- return 1;
+ p->tcfa_bindcnt++;
+ p->tcfa_refcnt++;
+ *a = p;
+ return true;
}
- return 0;
+ return false;
}
EXPORT_SYMBOL(tcf_hash_check);
void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
{
- struct tcf_common *pc = a->priv;
if (est)
- gen_kill_estimator(&pc->tcfc_bstats,
- &pc->tcfc_rate_est);
- call_rcu(&pc->tcfc_rcu, free_tcf);
+ gen_kill_estimator(&a->tcfa_bstats,
+ &a->tcfa_rate_est);
+ call_rcu(&a->tcfa_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
- struct tc_action *a, int size, int bind, bool cpustats)
+ struct tc_action **a, const struct tc_action_ops *ops,
+ int bind, bool cpustats)
{
- struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+ struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
struct tcf_hashinfo *hinfo = tn->hinfo;
int err = -ENOMEM;
if (unlikely(!p))
return -ENOMEM;
- p->tcfc_refcnt = 1;
+ p->tcfa_refcnt = 1;
if (bind)
- p->tcfc_bindcnt = 1;
+ p->tcfa_bindcnt = 1;
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
@@ -278,35 +271,37 @@ err2:
goto err1;
}
}
- spin_lock_init(&p->tcfc_lock);
- INIT_HLIST_NODE(&p->tcfc_head);
- p->tcfc_index = index ? index : tcf_hash_new_index(tn);
- p->tcfc_tm.install = jiffies;
- p->tcfc_tm.lastuse = jiffies;
+ spin_lock_init(&p->tcfa_lock);
+ INIT_HLIST_NODE(&p->tcfa_head);
+ p->tcfa_index = index ? index : tcf_hash_new_index(tn);
+ p->tcfa_tm.install = jiffies;
+ p->tcfa_tm.lastuse = jiffies;
+ p->tcfa_tm.firstuse = 0;
if (est) {
- err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
- &p->tcfc_rate_est,
- &p->tcfc_lock, est);
+ err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
+ &p->tcfa_rate_est,
+ &p->tcfa_lock, NULL, est);
if (err) {
free_percpu(p->cpu_qstats);
goto err2;
}
}
- a->priv = (void *) p;
- a->hinfo = hinfo;
+ p->hinfo = hinfo;
+ p->ops = ops;
+ INIT_LIST_HEAD(&p->list);
+ *a = p;
return 0;
}
EXPORT_SYMBOL(tcf_hash_create);
void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a)
{
- struct tcf_common *p = a->priv;
struct tcf_hashinfo *hinfo = tn->hinfo;
- unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
+ unsigned int h = tcf_hash(a->tcfa_index, hinfo->hmask);
spin_lock_bh(&hinfo->lock);
- hlist_add_head(&p->tcfc_head, &hinfo->htab[h]);
+ hlist_add_head(&a->tcfa_head, &hinfo->htab[h]);
spin_unlock_bh(&hinfo->lock);
}
EXPORT_SYMBOL(tcf_hash_insert);
@@ -314,21 +309,16 @@ EXPORT_SYMBOL(tcf_hash_insert);
void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
struct tcf_hashinfo *hinfo)
{
- struct tc_action a = {
- .ops = ops,
- .hinfo = hinfo,
- };
int i;
for (i = 0; i < hinfo->hmask + 1; i++) {
- struct tcf_common *p;
+ struct tc_action *p;
struct hlist_node *n;
- hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) {
+ hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfa_head) {
int ret;
- a.priv = p;
- ret = __tcf_hash_release(&a, false, true);
+ ret = __tcf_hash_release(p, false, true);
if (ret == ACT_P_DELETED)
module_put(ops->owner);
else if (ret < 0)
@@ -430,18 +420,19 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
return res;
}
-int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
- struct tcf_result *res)
+int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
+ int nr_actions, struct tcf_result *res)
{
- const struct tc_action *a;
- int ret = -1;
+ int ret = -1, i;
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
ret = TC_ACT_OK;
goto exec_done;
}
- list_for_each_entry(a, actions, list) {
+ for (i = 0; i < nr_actions; i++) {
+ const struct tc_action *a = actions[i];
+
repeat:
ret = a->ops->act(skb, a, res);
if (ret == TC_ACT_REPEAT)
@@ -465,8 +456,6 @@ int tcf_action_destroy(struct list_head *actions, int bind)
module_put(a->ops->owner);
else if (ret < 0)
return ret;
- list_del(&a->list);
- kfree(a);
}
return ret;
}
@@ -503,8 +492,8 @@ nla_put_failure:
}
EXPORT_SYMBOL(tcf_action_dump_1);
-int
-tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref)
+int tcf_action_dump(struct sk_buff *skb, struct list_head *actions,
+ int bind, int ref)
{
struct tc_action *a;
int err = -EINVAL;
@@ -580,20 +569,13 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
goto err_out;
}
- err = -ENOMEM;
- a = kzalloc(sizeof(*a), GFP_KERNEL);
- if (a == NULL)
- goto err_mod;
-
- a->ops = a_o;
- INIT_LIST_HEAD(&a->list);
/* backward compatibility for policer */
if (name == NULL)
- err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind);
+ err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind);
else
- err = a_o->init(net, nla, est, a, ovr, bind);
+ err = a_o->init(net, nla, est, &a, ovr, bind);
if (err < 0)
- goto err_free;
+ goto err_mod;
/* module count goes up only when brand new policy is created
* if it exists and is only bound to in a_o->init() then
@@ -604,8 +586,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
return a;
-err_free:
- kfree(a);
err_mod:
module_put(a_o->owner);
err_out:
@@ -641,12 +621,11 @@ err:
return err;
}
-int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
+int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
int compat_mode)
{
int err = 0;
struct gnet_dump d;
- struct tcf_common *p = a->priv;
if (p == NULL)
goto errout;
@@ -655,27 +634,27 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
* to add additional backward compatibility statistic TLVs.
*/
if (compat_mode) {
- if (a->type == TCA_OLD_COMPAT)
+ if (p->type == TCA_OLD_COMPAT)
err = gnet_stats_start_copy_compat(skb, 0,
TCA_STATS,
TCA_XSTATS,
- &p->tcfc_lock, &d,
+ &p->tcfa_lock, &d,
TCA_PAD);
else
return 0;
} else
err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
- &p->tcfc_lock, &d, TCA_ACT_PAD);
+ &p->tcfa_lock, &d, TCA_ACT_PAD);
if (err < 0)
goto errout;
- if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
- &p->tcfc_rate_est) < 0 ||
+ if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
+ &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
- &p->tcfc_qstats,
- p->tcfc_qstats.qlen) < 0)
+ &p->tcfa_qstats,
+ p->tcfa_qstats.qlen) < 0)
goto errout;
if (gnet_stats_finish_copy(&d) < 0)
@@ -687,9 +666,9 @@ errout:
return -1;
}
-static int
-tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq,
- u16 flags, int event, int bind, int ref)
+static int tca_get_fill(struct sk_buff *skb, struct list_head *actions,
+ u32 portid, u32 seq, u16 flags, int event, int bind,
+ int ref)
{
struct tcamsg *t;
struct nlmsghdr *nlh;
@@ -730,7 +709,8 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
- if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
+ 0, 0) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -738,24 +718,11 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
return rtnl_unicast(skb, net, portid);
}
-static struct tc_action *create_a(int i)
-{
- struct tc_action *act;
-
- act = kzalloc(sizeof(*act), GFP_KERNEL);
- if (act == NULL) {
- pr_debug("create_a: failed to alloc!\n");
- return NULL;
- }
- act->order = i;
- INIT_LIST_HEAD(&act->list);
- return act;
-}
-
static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
struct nlmsghdr *n, u32 portid)
{
struct nlattr *tb[TCA_ACT_MAX + 1];
+ const struct tc_action_ops *ops;
struct tc_action *a;
int index;
int err;
@@ -770,40 +737,23 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
goto err_out;
index = nla_get_u32(tb[TCA_ACT_INDEX]);
- err = -ENOMEM;
- a = create_a(0);
- if (a == NULL)
- goto err_out;
-
err = -EINVAL;
- a->ops = tc_lookup_action(tb[TCA_ACT_KIND]);
- if (a->ops == NULL) /* could happen in batch of actions */
- goto err_free;
+ ops = tc_lookup_action(tb[TCA_ACT_KIND]);
+ if (!ops) /* could happen in batch of actions */
+ goto err_out;
err = -ENOENT;
- if (a->ops->lookup(net, a, index) == 0)
+ if (ops->lookup(net, &a, index) == 0)
goto err_mod;
- module_put(a->ops->owner);
+ module_put(ops->owner);
return a;
err_mod:
- module_put(a->ops->owner);
-err_free:
- kfree(a);
+ module_put(ops->owner);
err_out:
return ERR_PTR(err);
}
-static void cleanup_a(struct list_head *actions)
-{
- struct tc_action *a, *tmp;
-
- list_for_each_entry_safe(a, tmp, actions, list) {
- list_del(&a->list);
- kfree(a);
- }
-}
-
static int tca_action_flush(struct net *net, struct nlattr *nla,
struct nlmsghdr *n, u32 portid)
{
@@ -814,8 +764,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
struct netlink_callback dcb;
struct nlattr *nest;
struct nlattr *tb[TCA_ACT_MAX + 1];
+ const struct tc_action_ops *ops;
struct nlattr *kind;
- struct tc_action a;
int err = -ENOMEM;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
@@ -832,13 +782,12 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
err = -EINVAL;
kind = tb[TCA_ACT_KIND];
- memset(&a, 0, sizeof(struct tc_action));
- INIT_LIST_HEAD(&a.list);
- a.ops = tc_lookup_action(kind);
- if (a.ops == NULL) /*some idjot trying to flush unknown action */
+ ops = tc_lookup_action(kind);
+ if (!ops) /*some idjot trying to flush unknown action */
goto err_out;
- nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0);
+ nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION,
+ sizeof(*t), 0);
if (!nlh)
goto out_module_put;
t = nlmsg_data(nlh);
@@ -850,7 +799,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
if (nest == NULL)
goto out_module_put;
- err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a);
+ err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
if (err < 0)
goto out_module_put;
if (err == 0)
@@ -860,7 +809,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
nlh->nlmsg_flags |= NLM_F_ROOT;
- module_put(a.ops->owner);
+ module_put(ops->owner);
err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO);
if (err > 0)
@@ -869,7 +818,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
return err;
out_module_put:
- module_put(a.ops->owner);
+ module_put(ops->owner);
err_out:
noflush_out:
kfree_skb(skb);
@@ -946,7 +895,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
return ret;
}
err:
- cleanup_a(&actions);
+ tcf_action_destroy(&actions, 0);
return ret;
}
@@ -983,15 +932,9 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions);
if (ret)
- goto done;
+ return ret;
- /* dump then free all the actions after update; inserted policy
- * stays intact
- */
- ret = tcf_add_notify(net, n, &actions, portid);
- cleanup_a(&actions);
-done:
- return ret;
+ return tcf_add_notify(net, n, &actions, portid);
}
static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)
@@ -1001,7 +944,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)
u32 portid = skb ? NETLINK_CB(skb).portid : 0;
int ret = 0, ovr = 0;
- if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN))
+ if ((n->nlmsg_type != RTM_GETACTION) &&
+ !netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);
@@ -1080,7 +1024,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
struct tc_action_ops *a_o;
- struct tc_action a;
int ret = 0;
struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
struct nlattr *kind = find_dump_kind(cb->nlh);
@@ -1094,9 +1037,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
if (a_o == NULL)
return 0;
- memset(&a, 0, sizeof(struct tc_action));
- a.ops = a_o;
-
nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
cb->nlh->nlmsg_type, sizeof(*t), 0);
if (!nlh)
@@ -1110,7 +1050,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
if (nest == NULL)
goto out_module_put;
- ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a);
+ ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o);
if (ret < 0)
goto out_module_put;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c7123e01c..bfa870731 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -34,11 +34,12 @@ struct tcf_bpf_cfg {
};
static int bpf_net_id;
+static struct tc_action_ops act_bpf_ops;
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
struct tcf_result *res)
{
- struct tcf_bpf *prog = act->priv;
+ struct tcf_bpf *prog = to_bpf(act);
struct bpf_prog *filter;
int action, filter_res;
bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
@@ -134,7 +135,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
int bind, int ref)
{
unsigned char *tp = skb_tail_pointer(skb);
- struct tcf_bpf *prog = act->priv;
+ struct tcf_bpf *prog = to_bpf(act);
struct tc_act_bpf opt = {
.index = prog->tcf_index,
.refcnt = prog->tcf_refcnt - ref,
@@ -154,10 +155,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
if (ret)
goto nla_put_failure;
- tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install);
- tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse);
- tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires);
-
+ tcf_tm_dump(&tm, &prog->tcf_tm);
if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm,
TCA_ACT_BPF_PAD))
goto nla_put_failure;
@@ -172,7 +170,8 @@ nla_put_failure:
static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = {
[TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) },
[TCA_ACT_BPF_FD] = { .type = NLA_U32 },
- [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN },
+ [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING,
+ .len = ACT_BPF_NAME_LEN },
[TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 },
[TCA_ACT_BPF_OPS] = { .type = NLA_BINARY,
.len = sizeof(struct sock_filter) * BPF_MAXINSNS },
@@ -225,15 +224,10 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]);
- fp = bpf_prog_get(bpf_fd);
+ fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT);
if (IS_ERR(fp))
return PTR_ERR(fp);
- if (fp->type != BPF_PROG_TYPE_SCHED_ACT) {
- bpf_prog_put(fp);
- return -EINVAL;
- }
-
if (tb[TCA_ACT_BPF_NAME]) {
name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]),
nla_len(tb[TCA_ACT_BPF_NAME]),
@@ -277,7 +271,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
}
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *act,
+ struct nlattr *est, struct tc_action **act,
int replace, int bind)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
@@ -302,7 +296,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(tn, parm->index, act, bind)) {
ret = tcf_hash_create(tn, parm->index, est, act,
- sizeof(*prog), bind, true);
+ &act_bpf_ops, bind, true);
if (ret < 0)
return ret;
@@ -312,7 +306,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (bind)
return 0;
- tcf_hash_release(act, bind);
+ tcf_hash_release(*act, bind);
if (!replace)
return -EEXIST;
}
@@ -332,7 +326,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (ret < 0)
goto out;
- prog = to_bpf(act);
+ prog = to_bpf(*act);
ASSERT_RTNL();
if (res != ACT_P_CREATED)
@@ -350,7 +344,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
rcu_assign_pointer(prog->filter, cfg.filter);
if (res == ACT_P_CREATED) {
- tcf_hash_insert(tn, act);
+ tcf_hash_insert(tn, *act);
} else {
/* make sure the program being replaced is no longer executing */
synchronize_rcu();
@@ -360,7 +354,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
return res;
out:
if (res == ACT_P_CREATED)
- tcf_hash_cleanup(act, est);
+ tcf_hash_cleanup(*act, est);
return ret;
}
@@ -369,20 +363,20 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind)
{
struct tcf_bpf_cfg tmp;
- tcf_bpf_prog_fill_cfg(act->priv, &tmp);
+ tcf_bpf_prog_fill_cfg(to_bpf(act), &tmp);
tcf_bpf_cfg_cleanup(&tmp);
}
static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
@@ -399,6 +393,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
.init = tcf_bpf_init,
.walk = tcf_bpf_walker,
.lookup = tcf_bpf_search,
+ .size = sizeof(struct tcf_bpf),
};
static __net_init int bpf_init_net(struct net *net)
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 2ba700c76..eae07a2e7 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -31,6 +31,7 @@
#define CONNMARK_TAB_MASK 3
static int connmark_net_id;
+static struct tc_action_ops act_connmark_ops;
static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
@@ -38,13 +39,13 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
const struct nf_conntrack_tuple_hash *thash;
struct nf_conntrack_tuple tuple;
enum ip_conntrack_info ctinfo;
- struct tcf_connmark_info *ca = a->priv;
+ struct tcf_connmark_info *ca = to_connmark(a);
struct nf_conntrack_zone zone;
struct nf_conn *c;
int proto;
spin_lock(&ca->tcf_lock);
- ca->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&ca->tcf_tm);
bstats_update(&ca->tcf_bstats, skb);
if (skb->protocol == htons(ETH_P_IP)) {
@@ -96,7 +97,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
};
static int tcf_connmark_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a,
+ struct nlattr *est, struct tc_action **a,
int ovr, int bind)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -116,22 +117,22 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(tn, parm->index, a, bind)) {
ret = tcf_hash_create(tn, parm->index, est, a,
- sizeof(*ci), bind, false);
+ &act_connmark_ops, bind, false);
if (ret)
return ret;
- ci = to_connmark(a);
+ ci = to_connmark(*a);
ci->tcf_action = parm->action;
ci->net = net;
ci->zone = parm->zone;
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
ret = ACT_P_CREATED;
} else {
- ci = to_connmark(a);
+ ci = to_connmark(*a);
if (bind)
return 0;
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
/* replacing action and zone */
@@ -146,7 +147,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_connmark_info *ci = a->priv;
+ struct tcf_connmark_info *ci = to_connmark(a);
struct tc_connmark opt = {
.index = ci->tcf_index,
@@ -160,9 +161,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(ci->tcf_tm.expires);
+ tcf_tm_dump(&t, &ci->tcf_tm);
if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
TCA_CONNMARK_PAD))
goto nla_put_failure;
@@ -175,14 +174,14 @@ nla_put_failure:
static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -198,6 +197,7 @@ static struct tc_action_ops act_connmark_ops = {
.init = tcf_connmark_init,
.walk = tcf_connmark_walker,
.lookup = tcf_connmark_search,
+ .size = sizeof(struct tcf_connmark_info),
};
static __net_init int connmark_init_net(struct net *net)
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 28e934ed0..b5dbf633a 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -43,9 +43,10 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
};
static int csum_net_id;
+static struct tc_action_ops act_csum_ops;
static int tcf_csum_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a, int ovr,
+ struct nlattr *est, struct tc_action **a, int ovr,
int bind)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
@@ -67,26 +68,26 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(tn, parm->index, a, bind)) {
ret = tcf_hash_create(tn, parm->index, est, a,
- sizeof(*p), bind, false);
+ &act_csum_ops, bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
if (bind)/* dont override defaults */
return 0;
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
}
- p = to_tcf_csum(a);
+ p = to_tcf_csum(*a);
spin_lock_bh(&p->tcf_lock);
p->tcf_action = parm->action;
p->update_flags = parm->update_flags;
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
}
@@ -496,12 +497,12 @@ fail:
static int tcf_csum(struct sk_buff *skb,
const struct tc_action *a, struct tcf_result *res)
{
- struct tcf_csum *p = a->priv;
+ struct tcf_csum *p = to_tcf_csum(a);
int action;
u32 update_flags;
spin_lock(&p->tcf_lock);
- p->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&p->tcf_tm);
bstats_update(&p->tcf_bstats, skb);
action = p->tcf_action;
update_flags = p->update_flags;
@@ -534,7 +535,7 @@ static int tcf_csum_dump(struct sk_buff *skb,
struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_csum *p = a->priv;
+ struct tcf_csum *p = to_tcf_csum(a);
struct tc_csum opt = {
.update_flags = p->update_flags,
.index = p->tcf_index,
@@ -546,9 +547,8 @@ static int tcf_csum_dump(struct sk_buff *skb,
if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+
+ tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
goto nla_put_failure;
@@ -561,14 +561,14 @@ nla_put_failure:
static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
@@ -584,6 +584,7 @@ static struct tc_action_ops act_csum_ops = {
.init = tcf_csum_init,
.walk = tcf_csum_walker,
.lookup = tcf_csum_search,
+ .size = sizeof(struct tcf_csum),
};
static __net_init int csum_init_net(struct net *net)
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index ec5cc8435..e24a4093d 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -26,6 +26,7 @@
#define GACT_TAB_MASK 15
static int gact_net_id;
+static struct tc_action_ops act_gact_ops;
#ifdef CONFIG_GACT_PROB
static int gact_net_rand(struct tcf_gact *gact)
@@ -56,7 +57,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
};
static int tcf_gact_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a,
+ struct nlattr *est, struct tc_action **a,
int ovr, int bind)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
@@ -93,19 +94,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(tn, parm->index, a, bind)) {
ret = tcf_hash_create(tn, parm->index, est, a,
- sizeof(*gact), bind, true);
+ &act_gact_ops, bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
if (bind)/* dont override defaults */
return 0;
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
}
- gact = to_gact(a);
+ gact = to_gact(*a);
ASSERT_RTNL();
gact->tcf_action = parm->action;
@@ -121,14 +122,14 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
}
#endif
if (ret == ACT_P_CREATED)
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
}
static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_gact *gact = a->priv;
+ struct tcf_gact *gact = to_gact(a);
int action = READ_ONCE(gact->tcf_action);
#ifdef CONFIG_GACT_PROB
@@ -151,7 +152,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
u64 lastuse)
{
- struct tcf_gact *gact = a->priv;
+ struct tcf_gact *gact = to_gact(a);
int action = READ_ONCE(gact->tcf_action);
struct tcf_t *tm = &gact->tcf_tm;
@@ -162,10 +163,11 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
tm->lastuse = lastuse;
}
-static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_gact *gact = a->priv;
+ struct tcf_gact *gact = to_gact(a);
struct tc_gact opt = {
.index = gact->tcf_index,
.refcnt = gact->tcf_refcnt - ref,
@@ -188,9 +190,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
goto nla_put_failure;
}
#endif
- t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
+ tcf_tm_dump(&t, &gact->tcf_tm);
if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
goto nla_put_failure;
return skb->len;
@@ -202,14 +202,14 @@ nla_put_failure:
static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
@@ -226,6 +226,7 @@ static struct tc_action_ops act_gact_ops = {
.init = tcf_gact_init,
.walk = tcf_gact_walker,
.lookup = tcf_gact_search,
+ .size = sizeof(struct tcf_gact),
};
static __net_init int gact_init_net(struct net *net)
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 5c4cdea21..4a60cd5e1 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -37,6 +37,7 @@
static int ife_net_id;
static int max_metacnt = IFE_META_MAX + 1;
+static struct tc_action_ops act_ife_ops;
static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
[TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)},
@@ -52,7 +53,7 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
u32 *tlv = (u32 *)(skbdata);
u16 totlen = nla_total_size(dlen); /*alignment + hdr */
char *dptr = (char *)tlv + NLA_HDRLEN;
- u32 htlv = attrtype << 16 | dlen;
+ u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
*tlv = htonl(htlv);
memset(dptr, 0, totlen - NLA_HDRLEN);
@@ -364,7 +365,7 @@ out_nlmsg_trim:
/* under ife->tcf_lock */
static void _tcf_ife_cleanup(struct tc_action *a, int bind)
{
- struct tcf_ife_info *ife = a->priv;
+ struct tcf_ife_info *ife = to_ife(a);
struct tcf_meta_info *e, *n;
list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
@@ -382,7 +383,7 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind)
static void tcf_ife_cleanup(struct tc_action *a, int bind)
{
- struct tcf_ife_info *ife = a->priv;
+ struct tcf_ife_info *ife = to_ife(a);
spin_lock_bh(&ife->tcf_lock);
_tcf_ife_cleanup(a, bind);
@@ -417,7 +418,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
}
static int tcf_ife_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a,
+ struct nlattr *est, struct tc_action **a,
int ovr, int bind)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
@@ -428,7 +429,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
u16 ife_type = 0;
u8 *daddr = NULL;
u8 *saddr = NULL;
- int ret = 0, exists = 0;
+ bool exists = false;
+ int ret = 0;
int err;
err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy);
@@ -450,25 +452,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
**/
if (!tb[TCA_IFE_TYPE]) {
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
pr_info("You MUST pass etherype for encoding\n");
return -EINVAL;
}
}
if (!exists) {
- ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife),
+ ret = tcf_hash_create(tn, parm->index, est, a, &act_ife_ops,
bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
}
- ife = to_ife(a);
+ ife = to_ife(*a);
ife->flags = parm->flags;
if (parm->flags & IFE_ENCODE) {
@@ -506,9 +508,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
if (err) {
metadata_parse_err:
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (ret == ACT_P_CREATED)
- _tcf_ife_cleanup(a, bind);
+ _tcf_ife_cleanup(*a, bind);
if (exists)
spin_unlock_bh(&ife->tcf_lock);
@@ -528,7 +530,7 @@ metadata_parse_err:
err = use_all_metadata(ife);
if (err) {
if (ret == ACT_P_CREATED)
- _tcf_ife_cleanup(a, bind);
+ _tcf_ife_cleanup(*a, bind);
if (exists)
spin_unlock_bh(&ife->tcf_lock);
@@ -540,7 +542,7 @@ metadata_parse_err:
spin_unlock_bh(&ife->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
}
@@ -549,7 +551,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_ife_info *ife = a->priv;
+ struct tcf_ife_info *ife = to_ife(a);
struct tc_ife opt = {
.index = ife->tcf_index,
.refcnt = ife->tcf_refcnt - ref,
@@ -562,9 +564,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(ife->tcf_tm.expires);
+ tcf_tm_dump(&t, &ife->tcf_tm);
if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD))
goto nla_put_failure;
@@ -624,15 +624,15 @@ struct meta_tlvhdr {
static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_ife_info *ife = a->priv;
+ struct tcf_ife_info *ife = to_ife(a);
int action = ife->tcf_action;
struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
- u16 ifehdrln = ifehdr->metalen;
+ int ifehdrln = (int)ifehdr->metalen;
struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
spin_lock(&ife->tcf_lock);
bstats_update(&ife->tcf_bstats, skb);
- ife->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&ife->tcf_tm);
spin_unlock(&ife->tcf_lock);
ifehdrln = ntohs(ifehdrln);
@@ -698,7 +698,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_ife_info *ife = a->priv;
+ struct tcf_ife_info *ife = to_ife(a);
int action = ife->tcf_action;
struct ethhdr *oethh; /* outer ether header */
struct ethhdr *iethh; /* inner eth header */
@@ -722,7 +722,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
spin_lock(&ife->tcf_lock);
bstats_update(&ife->tcf_bstats, skb);
- ife->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&ife->tcf_tm);
if (!metalen) { /* no metadata to send */
/* abuse overlimits to count when we allow packet
@@ -740,8 +740,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
return TC_ACT_SHOT;
}
- iethh = eth_hdr(skb);
-
err = skb_cow_head(skb, hdrm);
if (unlikely(err)) {
ife->tcf_qstats.drops++;
@@ -752,6 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
if (!(at & AT_EGRESS))
skb_push(skb, skb->dev->hard_header_len);
+ iethh = (struct ethhdr *)skb->data;
__skb_push(skb, hdrm);
memcpy(skb->data, iethh, skb->mac_len);
skb_reset_mac_header(skb);
@@ -802,7 +801,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_ife_info *ife = a->priv;
+ struct tcf_ife_info *ife = to_ife(a);
if (ife->flags & IFE_ENCODE)
return tcf_ife_encode(skb, a, res);
@@ -813,7 +812,7 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
pr_info_ratelimited("unknown failure(policy neither de/encode\n");
spin_lock(&ife->tcf_lock);
bstats_update(&ife->tcf_bstats, skb);
- ife->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&ife->tcf_tm);
ife->tcf_qstats.drops++;
spin_unlock(&ife->tcf_lock);
@@ -822,14 +821,14 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
@@ -846,6 +845,7 @@ static struct tc_action_ops act_ife_ops = {
.init = tcf_ife_init,
.walk = tcf_ife_walker,
.lookup = tcf_ife_search,
+ .size = sizeof(struct tcf_ife_info),
};
static __net_init int ife_init_net(struct net *net)
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d4bd19ee5..378c1c976 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -31,10 +31,13 @@
#define IPT_TAB_MASK 15
static int ipt_net_id;
+static struct tc_action_ops act_ipt_ops;
static int xt_net_id;
+static struct tc_action_ops act_xt_ops;
-static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
+static int ipt_init_target(struct xt_entry_target *t, char *table,
+ unsigned int hook)
{
struct xt_tgchk_param par;
struct xt_target *target;
@@ -89,14 +92,15 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
};
static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a, int ovr,
- int bind)
+ struct nlattr *est, struct tc_action **a,
+ const struct tc_action_ops *ops, int ovr, int bind)
{
struct nlattr *tb[TCA_IPT_MAX + 1];
struct tcf_ipt *ipt;
struct xt_entry_target *td, *t;
char *tname;
- int ret = 0, err, exists = 0;
+ bool exists = false;
+ int ret = 0, err;
u32 hook = 0;
u32 index = 0;
@@ -116,19 +120,19 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) {
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
return -EINVAL;
}
td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) {
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
return -EINVAL;
}
if (!exists) {
- ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind,
+ ret = tcf_hash_create(tn, index, est, a, ops, bind,
false);
if (ret)
return ret;
@@ -136,13 +140,11 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
} else {
if (bind)/* dont override defaults */
return 0;
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
}
- ipt = to_ipt(a);
-
hook = nla_get_u32(tb[TCA_IPT_HOOK]);
err = -ENOMEM;
@@ -161,6 +163,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
if (err < 0)
goto err3;
+ ipt = to_ipt(*a);
+
spin_lock_bh(&ipt->tcf_lock);
if (ret != ACT_P_CREATED) {
ipt_destroy_target(ipt->tcfi_t);
@@ -172,7 +176,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
ipt->tcfi_hook = hook;
spin_unlock_bh(&ipt->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
err3:
@@ -181,33 +185,33 @@ err2:
kfree(tname);
err1:
if (ret == ACT_P_CREATED)
- tcf_hash_cleanup(a, est);
+ tcf_hash_cleanup(*a, est);
return err;
}
static int tcf_ipt_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a, int ovr,
+ struct nlattr *est, struct tc_action **a, int ovr,
int bind)
{
struct tc_action_net *tn = net_generic(net, ipt_net_id);
- return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+ return __tcf_ipt_init(tn, nla, est, a, &act_ipt_ops, ovr, bind);
}
static int tcf_xt_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a, int ovr,
+ struct nlattr *est, struct tc_action **a, int ovr,
int bind)
{
struct tc_action_net *tn = net_generic(net, xt_net_id);
- return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+ return __tcf_ipt_init(tn, nla, est, a, &act_xt_ops, ovr, bind);
}
static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
int ret = 0, result = 0;
- struct tcf_ipt *ipt = a->priv;
+ struct tcf_ipt *ipt = to_ipt(a);
struct xt_action_param par;
if (skb_unclone(skb, GFP_ATOMIC))
@@ -215,7 +219,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
spin_lock(&ipt->tcf_lock);
- ipt->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&ipt->tcf_tm);
bstats_update(&ipt->tcf_bstats, skb);
/* yes, we have to worry about both in and out dev
@@ -245,7 +249,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
default:
net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n",
ret);
- result = TC_POLICE_OK;
+ result = TC_ACT_OK;
break;
}
spin_unlock(&ipt->tcf_lock);
@@ -253,10 +257,11 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
}
-static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_ipt *ipt = a->priv;
+ struct tcf_ipt *ipt = to_ipt(a);
struct xt_entry_target *t;
struct tcf_t tm;
struct tc_cnt c;
@@ -280,11 +285,11 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) ||
nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname))
goto nla_put_failure;
- tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
- tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
- tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
+
+ tcf_tm_dump(&tm, &ipt->tcf_tm);
if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
goto nla_put_failure;
+
kfree(t);
return skb->len;
@@ -296,14 +301,14 @@ nla_put_failure:
static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, ipt_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, ipt_net_id);
@@ -320,6 +325,7 @@ static struct tc_action_ops act_ipt_ops = {
.init = tcf_ipt_init,
.walk = tcf_ipt_walker,
.lookup = tcf_ipt_search,
+ .size = sizeof(struct tcf_ipt),
};
static __net_init int ipt_init_net(struct net *net)
@@ -345,14 +351,14 @@ static struct pernet_operations ipt_net_ops = {
static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, xt_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, xt_net_id);
@@ -369,6 +375,7 @@ static struct tc_action_ops act_xt_ops = {
.init = tcf_xt_init,
.walk = tcf_xt_walker,
.lookup = tcf_xt_search,
+ .size = sizeof(struct tcf_ipt),
};
static __net_init int xt_init_net(struct net *net)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 1f5bd6ccb..6038c85d9 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -52,9 +52,10 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
};
static int mirred_net_id;
+static struct tc_action_ops act_mirred_ops;
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a, int ovr,
+ struct nlattr *est, struct tc_action **a, int ovr,
int bind)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
@@ -62,7 +63,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct tc_mirred *parm;
struct tcf_mirred *m;
struct net_device *dev;
- int ret, ok_push = 0, exists = 0;
+ int ret, ok_push = 0;
+ bool exists = false;
if (nla == NULL)
return -EINVAL;
@@ -83,14 +85,14 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
break;
default:
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
return -EINVAL;
}
if (parm->ifindex) {
dev = __dev_get_by_index(net, parm->ifindex);
if (dev == NULL) {
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
return -ENODEV;
}
switch (dev->type) {
@@ -114,16 +116,16 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
if (dev == NULL)
return -EINVAL;
ret = tcf_hash_create(tn, parm->index, est, a,
- sizeof(*m), bind, true);
+ &act_mirred_ops, bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
}
- m = to_mirred(a);
+ m = to_mirred(*a);
ASSERT_RTNL();
m->tcf_action = parm->action;
@@ -141,7 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&mirred_list_lock);
list_add(&m->tcfm_list, &mirred_list);
spin_unlock_bh(&mirred_list_lock);
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
}
return ret;
@@ -150,14 +152,13 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_mirred *m = a->priv;
+ struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
struct sk_buff *skb2;
int retval, err;
u32 at;
tcf_lastuse_update(&m->tcf_tm);
-
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
rcu_read_lock();
@@ -206,7 +207,7 @@ out:
static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_mirred *m = a->priv;
+ struct tcf_mirred *m = to_mirred(a);
struct tc_mirred opt = {
.index = m->tcf_index,
.action = m->tcf_action,
@@ -219,9 +220,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
+
+ tcf_tm_dump(&t, &m->tcf_tm);
if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
goto nla_put_failure;
return skb->len;
@@ -233,14 +233,14 @@ nla_put_failure:
static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
@@ -285,6 +285,7 @@ static struct tc_action_ops act_mirred_ops = {
.init = tcf_mirred_init,
.walk = tcf_mirred_walker,
.lookup = tcf_mirred_search,
+ .size = sizeof(struct tcf_mirred),
};
static __net_init int mirred_init_net(struct net *net)
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index c0a879f94..8e8b0cc30 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -32,13 +32,14 @@
#define NAT_TAB_MASK 15
static int nat_net_id;
+static struct tc_action_ops act_nat_ops;
static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
[TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) },
};
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
- struct tc_action *a, int ovr, int bind)
+ struct tc_action **a, int ovr, int bind)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
struct nlattr *tb[TCA_NAT_MAX + 1];
@@ -59,18 +60,18 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
if (!tcf_hash_check(tn, parm->index, a, bind)) {
ret = tcf_hash_create(tn, parm->index, est, a,
- sizeof(*p), bind, false);
+ &act_nat_ops, bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
if (bind)
return 0;
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
}
- p = to_tcf_nat(a);
+ p = to_tcf_nat(*a);
spin_lock_bh(&p->tcf_lock);
p->old_addr = parm->old_addr;
@@ -82,7 +83,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
}
@@ -90,7 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_nat *p = a->priv;
+ struct tcf_nat *p = to_tcf_nat(a);
struct iphdr *iph;
__be32 old_addr;
__be32 new_addr;
@@ -103,7 +104,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
spin_lock(&p->tcf_lock);
- p->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&p->tcf_tm);
old_addr = p->old_addr;
new_addr = p->new_addr;
mask = p->mask;
@@ -248,7 +249,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_nat *p = a->priv;
+ struct tcf_nat *p = to_tcf_nat(a);
struct tc_nat opt = {
.old_addr = p->old_addr,
.new_addr = p->new_addr,
@@ -264,9 +265,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+
+ tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
goto nla_put_failure;
@@ -279,14 +279,14 @@ nla_put_failure:
static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
@@ -302,6 +302,7 @@ static struct tc_action_ops act_nat_ops = {
.init = tcf_nat_init,
.walk = tcf_nat_walker,
.lookup = tcf_nat_search,
+ .size = sizeof(struct tcf_nat),
};
static __net_init int nat_init_net(struct net *net)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index c6e18f230..b54d56d49 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -26,13 +26,14 @@
#define PEDIT_TAB_MASK 15
static int pedit_net_id;
+static struct tc_action_ops act_pedit_ops;
static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
[TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
};
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a,
+ struct nlattr *est, struct tc_action **a,
int ovr, int bind)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
@@ -61,23 +62,23 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
if (!parm->nkeys)
return -EINVAL;
ret = tcf_hash_create(tn, parm->index, est, a,
- sizeof(*p), bind, false);
+ &act_pedit_ops, bind, false);
if (ret)
return ret;
- p = to_pedit(a);
+ p = to_pedit(*a);
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL) {
- tcf_hash_cleanup(a, est);
+ tcf_hash_cleanup(*a, est);
return -ENOMEM;
}
ret = ACT_P_CREATED;
} else {
if (bind)
return 0;
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
- p = to_pedit(a);
+ p = to_pedit(*a);
if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL)
@@ -96,13 +97,13 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
memcpy(p->tcfp_keys, parm->keys, ksize);
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
}
static void tcf_pedit_cleanup(struct tc_action *a, int bind)
{
- struct tcf_pedit *p = a->priv;
+ struct tcf_pedit *p = to_pedit(a);
struct tc_pedit_key *keys = p->tcfp_keys;
kfree(keys);
}
@@ -110,7 +111,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind)
static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_pedit *p = a->priv;
+ struct tcf_pedit *p = to_pedit(a);
int i;
unsigned int off;
@@ -121,7 +122,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
spin_lock(&p->tcf_lock);
- p->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&p->tcf_tm);
if (p->tcfp_nkeys > 0) {
struct tc_pedit_key *tkey = p->tcfp_keys;
@@ -177,7 +178,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_pedit *p = a->priv;
+ struct tcf_pedit *p = to_pedit(a);
struct tc_pedit *opt;
struct tcf_t t;
int s;
@@ -200,11 +201,11 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+
+ tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
goto nla_put_failure;
+
kfree(opt);
return skb->len;
@@ -216,14 +217,14 @@ nla_put_failure:
static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
@@ -240,6 +241,7 @@ static struct tc_action_ops act_pedit_ops = {
.init = tcf_pedit_init,
.walk = tcf_pedit_walker,
.lookup = tcf_pedit_search,
+ .size = sizeof(struct tcf_pedit),
};
static __net_init int pedit_init_net(struct net *net)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index c55778976..8a3be1d99 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -23,7 +23,7 @@
#include <net/netlink.h>
struct tcf_police {
- struct tcf_common common;
+ struct tc_action common;
int tcfp_result;
u32 tcfp_ewma_rate;
s64 tcfp_burst;
@@ -37,8 +37,8 @@ struct tcf_police {
struct psched_ratecfg peak;
bool peak_present;
};
-#define to_police(pc) \
- container_of(pc->priv, struct tcf_police, common)
+
+#define to_police(pc) ((struct tcf_police *)pc)
#define POL_TAB_MASK 15
@@ -56,56 +56,15 @@ struct tc_police_compat {
/* Each policer is serialized by its individual spinlock */
static int police_net_id;
+static struct tc_action_ops act_police_ops;
static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, police_net_id);
- struct tcf_hashinfo *hinfo = tn->hinfo;
- struct hlist_head *head;
- struct tcf_common *p;
- int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
- struct nlattr *nest;
-
- spin_lock_bh(&hinfo->lock);
-
- s_i = cb->args[0];
-
- for (i = 0; i < (POL_TAB_MASK + 1); i++) {
- head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)];
-
- hlist_for_each_entry_rcu(p, head, tcfc_head) {
- index++;
- if (index < s_i)
- continue;
- a->priv = p;
- a->order = index;
- nest = nla_nest_start(skb, a->order);
- if (nest == NULL)
- goto nla_put_failure;
- if (type == RTM_DELACTION)
- err = tcf_action_dump_1(skb, a, 0, 1);
- else
- err = tcf_action_dump_1(skb, a, 0, 0);
- if (err < 0) {
- index--;
- nla_nest_cancel(skb, nest);
- goto done;
- }
- nla_nest_end(skb, nest);
- n_i++;
- }
- }
-done:
- spin_unlock_bh(&hinfo->lock);
- if (n_i)
- cb->args[0] += n_i;
- return n_i;
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- goto done;
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
@@ -115,9 +74,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
};
-static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a,
- int ovr, int bind)
+static int tcf_act_police_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind)
{
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
@@ -125,6 +84,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
struct tc_action_net *tn = net_generic(net, police_net_id);
+ bool exists = false;
int size;
if (nla == NULL)
@@ -139,31 +99,25 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
size = nla_len(tb[TCA_POLICE_TBF]);
if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat))
return -EINVAL;
+
parm = nla_data(tb[TCA_POLICE_TBF]);
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
- if (parm->index) {
- if (tcf_hash_search(tn, a, parm->index)) {
- police = to_police(a);
- if (bind) {
- police->tcf_bindcnt += 1;
- police->tcf_refcnt += 1;
- return 0;
- }
- if (ovr)
- goto override;
- /* not replacing */
- return -EEXIST;
- }
- } else {
+ if (!exists) {
ret = tcf_hash_create(tn, parm->index, NULL, a,
- sizeof(*police), bind, false);
+ &act_police_ops, bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
+ } else {
+ tcf_hash_release(*a, bind);
+ if (!ovr)
+ return -EEXIST;
}
- police = to_police(a);
-override:
+ police = to_police(*a);
if (parm->rate.rate) {
err = -ENOMEM;
R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]);
@@ -182,7 +136,8 @@ override:
if (est) {
err = gen_replace_estimator(&police->tcf_bstats, NULL,
&police->tcf_rate_est,
- &police->tcf_lock, est);
+ &police->tcf_lock,
+ NULL, est);
if (err)
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
@@ -234,7 +189,7 @@ override:
return ret;
police->tcfp_t_c = ktime_get_ns();
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
@@ -244,14 +199,14 @@ failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
if (ret == ACT_P_CREATED)
- tcf_hash_cleanup(a, est);
+ tcf_hash_cleanup(*a, est);
return err;
}
static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_police *police = a->priv;
+ struct tcf_police *police = to_police(a);
s64 now;
s64 toks;
s64 ptoks = 0;
@@ -310,7 +265,7 @@ static int
tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_police *police = a->priv;
+ struct tcf_police *police = to_police(a);
struct tc_police opt = {
.index = police->tcf_index,
.action = police->tcf_action,
@@ -336,6 +291,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
+ t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse);
t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
goto nla_put_failure;
@@ -347,7 +303,7 @@ nla_put_failure:
return -1;
}
-static int tcf_police_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_police_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, police_net_id);
@@ -364,9 +320,10 @@ static struct tc_action_ops act_police_ops = {
.owner = THIS_MODULE,
.act = tcf_act_police,
.dump = tcf_act_police_dump,
- .init = tcf_act_police_locate,
+ .init = tcf_act_police_init,
.walk = tcf_act_police_walker,
.lookup = tcf_police_search,
+ .size = sizeof(struct tcf_police),
};
static __net_init int police_init_net(struct net *net)
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index e42f8daca..289af6f9b 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -27,15 +27,16 @@
#define SIMP_TAB_MASK 7
static int simp_net_id;
+static struct tc_action_ops act_simp_ops;
#define SIMP_MAX_DATA 32
static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_defact *d = a->priv;
+ struct tcf_defact *d = to_defact(a);
spin_lock(&d->tcf_lock);
- d->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&d->tcf_tm);
bstats_update(&d->tcf_bstats, skb);
/* print policy string followed by _ then packet count
@@ -79,15 +80,16 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
};
static int tcf_simp_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a,
+ struct nlattr *est, struct tc_action **a,
int ovr, int bind)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
struct nlattr *tb[TCA_DEF_MAX + 1];
struct tc_defact *parm;
struct tcf_defact *d;
+ bool exists = false;
+ int ret = 0, err;
char *defdata;
- int ret = 0, err, exists = 0;
if (nla == NULL)
return -EINVAL;
@@ -99,7 +101,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
if (tb[TCA_DEF_PARMS] == NULL)
return -EINVAL;
-
parm = nla_data(tb[TCA_DEF_PARMS]);
exists = tcf_hash_check(tn, parm->index, a, bind);
if (exists && bind)
@@ -107,7 +108,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
if (tb[TCA_DEF_DATA] == NULL) {
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
return -EINVAL;
}
@@ -115,22 +116,22 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_hash_create(tn, parm->index, est, a,
- sizeof(*d), bind, false);
+ &act_simp_ops, bind, false);
if (ret)
return ret;
- d = to_defact(a);
+ d = to_defact(*a);
ret = alloc_defdata(d, defdata);
if (ret < 0) {
- tcf_hash_cleanup(a, est);
+ tcf_hash_cleanup(*a, est);
return ret;
}
d->tcf_action = parm->action;
ret = ACT_P_CREATED;
} else {
- d = to_defact(a);
+ d = to_defact(*a);
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
@@ -138,7 +139,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
}
if (ret == ACT_P_CREATED)
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
}
@@ -146,7 +147,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_defact *d = a->priv;
+ struct tcf_defact *d = to_defact(a);
struct tc_defact opt = {
.index = d->tcf_index,
.refcnt = d->tcf_refcnt - ref,
@@ -158,9 +159,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+
+ tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD))
goto nla_put_failure;
return skb->len;
@@ -172,14 +172,14 @@ nla_put_failure:
static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
@@ -196,6 +196,7 @@ static struct tc_action_ops act_simp_ops = {
.init = tcf_simp_init,
.walk = tcf_simp_walker,
.lookup = tcf_simp_search,
+ .size = sizeof(struct tcf_defact),
};
static __net_init int simp_init_net(struct net *net)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index e92880296..a133dcb82 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -30,14 +30,15 @@
#define SKBEDIT_TAB_MASK 15
static int skbedit_net_id;
+static struct tc_action_ops act_skbedit_ops;
static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_skbedit *d = a->priv;
+ struct tcf_skbedit *d = to_skbedit(a);
spin_lock(&d->tcf_lock);
- d->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&d->tcf_tm);
bstats_update(&d->tcf_bstats, skb);
if (d->flags & SKBEDIT_F_PRIORITY)
@@ -47,6 +48,8 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
skb_set_queue_mapping(skb, d->queue_mapping);
if (d->flags & SKBEDIT_F_MARK)
skb->mark = d->mark;
+ if (d->flags & SKBEDIT_F_PTYPE)
+ skb->pkt_type = d->ptype;
spin_unlock(&d->tcf_lock);
return d->tcf_action;
@@ -57,10 +60,11 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
[TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) },
[TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
+ [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
};
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a,
+ struct nlattr *est, struct tc_action **a,
int ovr, int bind)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
@@ -68,8 +72,9 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct tc_skbedit *parm;
struct tcf_skbedit *d;
u32 flags = 0, *priority = NULL, *mark = NULL;
- u16 *queue_mapping = NULL;
- int ret = 0, err, exists = 0;
+ u16 *queue_mapping = NULL, *ptype = NULL;
+ bool exists = false;
+ int ret = 0, err;
if (nla == NULL)
return -EINVAL;
@@ -91,6 +96,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
}
+ if (tb[TCA_SKBEDIT_PTYPE] != NULL) {
+ ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]);
+ if (!skb_pkt_type_ok(*ptype))
+ return -EINVAL;
+ flags |= SKBEDIT_F_PTYPE;
+ }
+
if (tb[TCA_SKBEDIT_MARK] != NULL) {
flags |= SKBEDIT_F_MARK;
mark = nla_data(tb[TCA_SKBEDIT_MARK]);
@@ -103,21 +115,21 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
return 0;
if (!flags) {
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
return -EINVAL;
}
if (!exists) {
ret = tcf_hash_create(tn, parm->index, est, a,
- sizeof(*d), bind, false);
+ &act_skbedit_ops, bind, false);
if (ret)
return ret;
- d = to_skbedit(a);
+ d = to_skbedit(*a);
ret = ACT_P_CREATED;
} else {
- d = to_skbedit(a);
- tcf_hash_release(a, bind);
+ d = to_skbedit(*a);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
}
@@ -131,13 +143,15 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
d->queue_mapping = *queue_mapping;
if (flags & SKBEDIT_F_MARK)
d->mark = *mark;
+ if (flags & SKBEDIT_F_PTYPE)
+ d->ptype = *ptype;
d->tcf_action = parm->action;
spin_unlock_bh(&d->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
}
@@ -145,7 +159,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_skbedit *d = a->priv;
+ struct tcf_skbedit *d = to_skbedit(a);
struct tc_skbedit opt = {
.index = d->tcf_index,
.refcnt = d->tcf_refcnt - ref,
@@ -157,20 +171,19 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
if ((d->flags & SKBEDIT_F_PRIORITY) &&
- nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
- &d->priority))
+ nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority))
goto nla_put_failure;
if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) &&
- nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING,
- sizeof(d->queue_mapping), &d->queue_mapping))
+ nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping))
goto nla_put_failure;
if ((d->flags & SKBEDIT_F_MARK) &&
- nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
- &d->mark))
+ nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+ if ((d->flags & SKBEDIT_F_PTYPE) &&
+ nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
goto nla_put_failure;
return skb->len;
@@ -182,14 +195,14 @@ nla_put_failure:
static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
@@ -205,6 +218,7 @@ static struct tc_action_ops act_skbedit_ops = {
.init = tcf_skbedit_init,
.walk = tcf_skbedit_walker,
.lookup = tcf_skbedit_search,
+ .size = sizeof(struct tcf_skbedit),
};
static __net_init int skbedit_init_net(struct net *net)
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index ac4adc812..691409de3 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -22,16 +22,17 @@
#define VLAN_TAB_MASK 15
static int vlan_net_id;
+static struct tc_action_ops act_vlan_ops;
static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- struct tcf_vlan *v = a->priv;
+ struct tcf_vlan *v = to_vlan(a);
int action;
int err;
spin_lock(&v->tcf_lock);
- v->tcf_tm.lastuse = jiffies;
+ tcf_lastuse_update(&v->tcf_tm);
bstats_update(&v->tcf_bstats, skb);
action = v->tcf_action;
@@ -67,7 +68,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
};
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a,
+ struct nlattr *est, struct tc_action **a,
int ovr, int bind)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
@@ -77,8 +78,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
int action;
__be16 push_vid = 0;
__be16 push_proto = 0;
- int ret = 0, exists = 0;
- int err;
+ bool exists = false;
+ int ret = 0, err;
if (!nla)
return -EINVAL;
@@ -100,13 +101,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
case TCA_VLAN_ACT_PUSH:
if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
return -EINVAL;
}
push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
if (push_vid >= VLAN_VID_MASK) {
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
return -ERANGE;
}
@@ -125,25 +126,25 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
break;
default:
if (exists)
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
return -EINVAL;
}
action = parm->v_action;
if (!exists) {
ret = tcf_hash_create(tn, parm->index, est, a,
- sizeof(*v), bind, false);
+ &act_vlan_ops, bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
- tcf_hash_release(a, bind);
+ tcf_hash_release(*a, bind);
if (!ovr)
return -EEXIST;
}
- v = to_vlan(a);
+ v = to_vlan(*a);
spin_lock_bh(&v->tcf_lock);
@@ -156,7 +157,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
spin_unlock_bh(&v->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(tn, a);
+ tcf_hash_insert(tn, *a);
return ret;
}
@@ -164,7 +165,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_vlan *v = a->priv;
+ struct tcf_vlan *v = to_vlan(a);
struct tc_vlan opt = {
.index = v->tcf_index,
.refcnt = v->tcf_refcnt - ref,
@@ -179,12 +180,11 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
if (v->tcfv_action == TCA_VLAN_ACT_PUSH &&
(nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) ||
- nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto)))
+ nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
+ v->tcfv_push_proto)))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse);
- t.expires = jiffies_to_clock_t(v->tcf_tm.expires);
+ tcf_tm_dump(&t, &v->tcf_tm);
if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
return skb->len;
@@ -196,14 +196,14 @@ nla_put_failure:
static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
- struct tc_action *a)
+ const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
- return tcf_generic_walker(tn, skb, cb, type, a);
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
-static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
@@ -219,6 +219,7 @@ static struct tc_action_ops act_vlan_ops = {
.init = tcf_vlan_init,
.walk = tcf_vlan_walker,
.lookup = tcf_vlan_search,
+ .size = sizeof(struct tcf_vlan),
};
static __net_init int vlan_init_net(struct net *net)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a75864d93..a7c564537 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -103,6 +103,17 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
unsigned long fh, int event);
+static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n,
+ struct tcf_proto __rcu **chain, int event)
+{
+ struct tcf_proto __rcu **it_chain;
+ struct tcf_proto *tp;
+
+ for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL;
+ it_chain = &tp->next)
+ tfilter_notify(net, oskb, n, tp, 0, event);
+}
/* Select new prio value from the range, managed by kernel. */
@@ -156,11 +167,23 @@ replay:
cl = 0;
if (prio == 0) {
- /* If no priority is given, user wants we allocated it. */
- if (n->nlmsg_type != RTM_NEWTFILTER ||
- !(n->nlmsg_flags & NLM_F_CREATE))
+ switch (n->nlmsg_type) {
+ case RTM_DELTFILTER:
+ if (protocol || t->tcm_handle || tca[TCA_KIND])
+ return -ENOENT;
+ break;
+ case RTM_NEWTFILTER:
+ /* If no priority is provided by the user,
+ * we allocate one.
+ */
+ if (n->nlmsg_flags & NLM_F_CREATE) {
+ prio = TC_H_MAKE(0x80000000U, 0U);
+ break;
+ }
+ /* fall-through */
+ default:
return -ENOENT;
- prio = TC_H_MAKE(0x80000000U, 0U);
+ }
}
/* Find head of filter chain. */
@@ -200,6 +223,12 @@ replay:
err = -EINVAL;
if (chain == NULL)
goto errout;
+ if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
+ tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
+ tcf_destroy_chain(chain);
+ err = 0;
+ goto errout;
+ }
/* Check the chain for existence of proto-tcf with this priority */
for (back = chain;
@@ -351,8 +380,9 @@ errout:
return err;
}
-static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp,
- unsigned long fh, u32 portid, u32 seq, u16 flags, int event)
+static int tcf_fill_node(struct net *net, struct sk_buff *skb,
+ struct tcf_proto *tp, unsigned long fh, u32 portid,
+ u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -474,9 +504,11 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
TC_H_MIN(tcm->tcm_info) != tp->protocol)
continue;
if (t > s_t)
- memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+ memset(&cb->args[1], 0,
+ sizeof(cb->args)-sizeof(cb->args[0]));
if (cb->args[1] == 0) {
- if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid,
+ if (tcf_fill_node(net, skb, tp, 0,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWTFILTER) <= 0)
break;
@@ -509,8 +541,12 @@ out:
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND);
- INIT_LIST_HEAD(&exts->actions);
+ LIST_HEAD(actions);
+
+ tcf_exts_to_list(exts, &actions);
+ tcf_action_destroy(&actions, TCA_ACT_UNBIND);
+ kfree(exts->actions);
+ exts->nr_actions = 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_destroy);
@@ -522,7 +558,6 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
{
struct tc_action *act;
- INIT_LIST_HEAD(&exts->actions);
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
"police", ovr,
@@ -531,14 +566,20 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
return PTR_ERR(act);
act->type = exts->type = TCA_OLD_COMPAT;
- list_add(&act->list, &exts->actions);
+ exts->actions[0] = act;
+ exts->nr_actions = 1;
} else if (exts->action && tb[exts->action]) {
- int err;
+ LIST_HEAD(actions);
+ int err, i = 0;
+
err = tcf_action_init(net, tb[exts->action], rate_tlv,
NULL, ovr,
- TCA_ACT_BIND, &exts->actions);
+ TCA_ACT_BIND, &actions);
if (err)
return err;
+ list_for_each_entry(act, &actions, list)
+ exts->actions[i++] = act;
+ exts->nr_actions = i;
}
}
#else
@@ -555,37 +596,49 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
- LIST_HEAD(tmp);
+ struct tcf_exts old = *dst;
+
tcf_tree_lock(tp);
- list_splice_init(&dst->actions, &tmp);
- list_splice(&src->actions, &dst->actions);
+ dst->nr_actions = src->nr_actions;
+ dst->actions = src->actions;
dst->type = src->type;
tcf_tree_unlock(tp);
- tcf_action_destroy(&tmp, TCA_ACT_UNBIND);
+
+ tcf_exts_destroy(&old);
#endif
}
EXPORT_SYMBOL(tcf_exts_change);
-#define tcf_exts_first_act(ext) \
- list_first_entry_or_null(&(exts)->actions, \
- struct tc_action, list)
+#ifdef CONFIG_NET_CLS_ACT
+static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts)
+{
+ if (exts->nr_actions == 0)
+ return NULL;
+ else
+ return exts->actions[0];
+}
+#endif
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
struct nlattr *nest;
- if (exts->action && !list_empty(&exts->actions)) {
+ if (exts->action && exts->nr_actions) {
/*
* again for backward compatible mode - we want
* to work with both old and new modes of entering
* tc data even if iproute2 was newer - jhs
*/
if (exts->type != TCA_OLD_COMPAT) {
+ LIST_HEAD(actions);
+
nest = nla_nest_start(skb, exts->action);
if (nest == NULL)
goto nla_put_failure;
- if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)
+
+ tcf_exts_to_list(exts, &actions);
+ if (tcf_action_dump(skb, &actions, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
} else if (exts->police) {
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 7b342c779..c3002c2c6 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -272,15 +272,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
- fp = bpf_prog_get(bpf_fd);
+ fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
if (IS_ERR(fp))
return PTR_ERR(fp);
- if (fp->type != BPF_PROG_TYPE_SCHED_CLS) {
- bpf_prog_put(fp);
- return -EINVAL;
- }
-
if (tb[TCA_BPF_NAME]) {
name = kmemdup(nla_data(tb[TCA_BPF_NAME]),
nla_len(tb[TCA_BPF_NAME]),
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index b3b7978f4..5060801a2 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -66,6 +66,7 @@ struct cls_fl_filter {
struct fl_flow_key key;
struct list_head list;
u32 handle;
+ u32 flags;
struct rcu_head rcu;
};
@@ -123,6 +124,9 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
+ if (!atomic_read(&head->ht.nelems))
+ return -1;
+
fl_clear_masked_range(&skb_key, &head->mask);
skb_key.indev_ifindex = skb->skb_iif;
/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
@@ -136,7 +140,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
f = rhashtable_lookup_fast(&head->ht,
fl_key_get_start(&skb_mkey, &head->mask),
head->ht_params);
- if (f) {
+ if (f && !tc_skip_sw(f->flags)) {
*res = f->res;
return tcf_exts_exec(skb, &f->exts, res);
}
@@ -183,19 +187,20 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
}
-static void fl_hw_replace_filter(struct tcf_proto *tp,
- struct flow_dissector *dissector,
- struct fl_flow_key *mask,
- struct fl_flow_key *key,
- struct tcf_exts *actions,
- unsigned long cookie, u32 flags)
+static int fl_hw_replace_filter(struct tcf_proto *tp,
+ struct flow_dissector *dissector,
+ struct fl_flow_key *mask,
+ struct fl_flow_key *key,
+ struct tcf_exts *actions,
+ unsigned long cookie, u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
+ int err;
if (!tc_should_offload(dev, tp, flags))
- return;
+ return tc_skip_sw(flags) ? -EINVAL : 0;
offload.command = TC_CLSFLOWER_REPLACE;
offload.cookie = cookie;
@@ -207,7 +212,12 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
tc.type = TC_SETUP_CLSFLOWER;
tc.cls_flower = &offload;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+
+ if (tc_skip_sw(flags))
+ return err;
+
+ return 0;
}
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
@@ -524,7 +534,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
struct cls_fl_filter *fnew;
struct nlattr *tb[TCA_FLOWER_MAX + 1];
struct fl_flow_mask mask = {};
- u32 flags = 0;
int err;
if (!tca[TCA_OPTIONS])
@@ -552,8 +561,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
}
fnew->handle = handle;
- if (tb[TCA_FLOWER_FLAGS])
- flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+ if (tb[TCA_FLOWER_FLAGS]) {
+ fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
+ if (!tc_flags_valid(fnew->flags)) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
if (err)
@@ -563,19 +578,23 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (err)
goto errout;
- err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
- head->ht_params);
+ if (!tc_skip_sw(fnew->flags)) {
+ err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
+ head->ht_params);
+ if (err)
+ goto errout;
+ }
+
+ err = fl_hw_replace_filter(tp,
+ &head->dissector,
+ &mask.key,
+ &fnew->key,
+ &fnew->exts,
+ (unsigned long)fnew,
+ fnew->flags);
if (err)
goto errout;
- fl_hw_replace_filter(tp,
- &head->dissector,
- &mask.key,
- &fnew->key,
- &fnew->exts,
- (unsigned long)fnew,
- flags);
-
if (fold) {
rhashtable_remove_fast(&head->ht, &fold->ht_node,
head->ht_params);
@@ -734,6 +753,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
sizeof(key->tp.dst))))
goto nla_put_failure;
+ nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
+
if (tcf_exts_dump(skb, &f->exts))
goto nla_put_failure;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
new file mode 100644
index 000000000..25927b6c4
--- /dev/null
+++ b/net/sched/cls_matchall.c
@@ -0,0 +1,318 @@
+/*
+ * net/sched/cls_matchll.c Match-all classifier
+ *
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
+
+struct cls_mall_filter {
+ struct tcf_exts exts;
+ struct tcf_result res;
+ u32 handle;
+ struct rcu_head rcu;
+ u32 flags;
+};
+
+struct cls_mall_head {
+ struct cls_mall_filter *filter;
+ struct rcu_head rcu;
+};
+
+static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_mall_head *head = rcu_dereference_bh(tp->root);
+ struct cls_mall_filter *f = head->filter;
+
+ if (tc_skip_sw(f->flags))
+ return -1;
+
+ return tcf_exts_exec(skb, &f->exts, res);
+}
+
+static int mall_init(struct tcf_proto *tp)
+{
+ struct cls_mall_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ return -ENOBUFS;
+
+ rcu_assign_pointer(tp->root, head);
+
+ return 0;
+}
+
+static void mall_destroy_filter(struct rcu_head *head)
+{
+ struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu);
+
+ tcf_exts_destroy(&f->exts);
+
+ kfree(f);
+}
+
+static int mall_replace_hw_filter(struct tcf_proto *tp,
+ struct cls_mall_filter *f,
+ unsigned long cookie)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_to_netdev offload;
+ struct tc_cls_matchall_offload mall_offload = {0};
+
+ offload.type = TC_SETUP_MATCHALL;
+ offload.cls_mall = &mall_offload;
+ offload.cls_mall->command = TC_CLSMATCHALL_REPLACE;
+ offload.cls_mall->exts = &f->exts;
+ offload.cls_mall->cookie = cookie;
+
+ return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+ &offload);
+}
+
+static void mall_destroy_hw_filter(struct tcf_proto *tp,
+ struct cls_mall_filter *f,
+ unsigned long cookie)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_to_netdev offload;
+ struct tc_cls_matchall_offload mall_offload = {0};
+
+ offload.type = TC_SETUP_MATCHALL;
+ offload.cls_mall = &mall_offload;
+ offload.cls_mall->command = TC_CLSMATCHALL_DESTROY;
+ offload.cls_mall->exts = NULL;
+ offload.cls_mall->cookie = cookie;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+ &offload);
+}
+
+static bool mall_destroy(struct tcf_proto *tp, bool force)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct cls_mall_filter *f = head->filter;
+
+ if (!force && f)
+ return false;
+
+ if (f) {
+ if (tc_should_offload(dev, tp, f->flags))
+ mall_destroy_hw_filter(tp, f, (unsigned long) f);
+
+ call_rcu(&f->rcu, mall_destroy_filter);
+ }
+ RCU_INIT_POINTER(tp->root, NULL);
+ kfree_rcu(head, rcu);
+ return true;
+}
+
+static unsigned long mall_get(struct tcf_proto *tp, u32 handle)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+ struct cls_mall_filter *f = head->filter;
+
+ if (f && f->handle == handle)
+ return (unsigned long) f;
+ return 0;
+}
+
+static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
+ [TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC },
+ [TCA_MATCHALL_CLASSID] = { .type = NLA_U32 },
+};
+
+static int mall_set_parms(struct net *net, struct tcf_proto *tp,
+ struct cls_mall_filter *f,
+ unsigned long base, struct nlattr **tb,
+ struct nlattr *est, bool ovr)
+{
+ struct tcf_exts e;
+ int err;
+
+ tcf_exts_init(&e, TCA_MATCHALL_ACT, 0);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_MATCHALL_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+ tcf_exts_change(tp, &f->exts, &e);
+
+ return 0;
+}
+
+static int mall_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+ struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg;
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct cls_mall_filter *f;
+ struct nlattr *tb[TCA_MATCHALL_MAX + 1];
+ u32 flags = 0;
+ int err;
+
+ if (!tca[TCA_OPTIONS])
+ return -EINVAL;
+
+ if (head->filter)
+ return -EBUSY;
+
+ if (fold)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_MATCHALL_MAX,
+ tca[TCA_OPTIONS], mall_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_MATCHALL_FLAGS]) {
+ flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]);
+ if (!tc_flags_valid(flags))
+ return -EINVAL;
+ }
+
+ f = kzalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ return -ENOBUFS;
+
+ tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0);
+
+ if (!handle)
+ handle = 1;
+ f->handle = handle;
+ f->flags = flags;
+
+ err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);
+ if (err)
+ goto errout;
+
+ if (tc_should_offload(dev, tp, flags)) {
+ err = mall_replace_hw_filter(tp, f, (unsigned long) f);
+ if (err) {
+ if (tc_skip_sw(flags))
+ goto errout;
+ else
+ err = 0;
+ }
+ }
+
+ *arg = (unsigned long) f;
+ rcu_assign_pointer(head->filter, f);
+
+ return 0;
+
+errout:
+ kfree(f);
+ return err;
+}
+
+static int mall_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+ struct cls_mall_filter *f = (struct cls_mall_filter *) arg;
+ struct net_device *dev = tp->q->dev_queue->dev;
+
+ if (tc_should_offload(dev, tp, f->flags))
+ mall_destroy_hw_filter(tp, f, (unsigned long) f);
+
+ RCU_INIT_POINTER(head->filter, NULL);
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, mall_destroy_filter);
+ return 0;
+}
+
+static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+ struct cls_mall_filter *f = head->filter;
+
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, (unsigned long) f, arg) < 0)
+ arg->stop = 1;
+skip:
+ arg->count++;
+}
+
+static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct cls_mall_filter *f = (struct cls_mall_filter *) fh;
+ struct nlattr *nest;
+
+ if (!f)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_MATCHALL_CLASSID, f->res.classid))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_mall_ops __read_mostly = {
+ .kind = "matchall",
+ .classify = mall_classify,
+ .init = mall_init,
+ .destroy = mall_destroy,
+ .get = mall_get,
+ .change = mall_change,
+ .delete = mall_delete,
+ .walk = mall_walk,
+ .dump = mall_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init cls_mall_init(void)
+{
+ return register_tcf_proto_ops(&cls_mall_ops);
+}
+
+static void __exit cls_mall_exit(void)
+{
+ unregister_tcf_proto_ops(&cls_mall_ops);
+}
+
+module_init(cls_mall_init);
+module_exit(cls_mall_exit);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Match-all classifier");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ddf047df5..12ebde845 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -95,8 +95,6 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,
Expected action: do not backoff, but wait until queue will clear.
NET_XMIT_CN - probably this packet enqueued, but another one dropped.
Expected action: backoff or ignore
- NET_XMIT_POLICED - dropped by police.
- Expected action: backoff or error to real-time apps.
Auxiliary routines:
@@ -583,7 +581,6 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
timer);
rcu_read_lock();
- qdisc_unthrottled(wd->qdisc);
__netif_schedule(qdisc_root(wd->qdisc));
rcu_read_unlock();
@@ -598,15 +595,12 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
}
EXPORT_SYMBOL(qdisc_watchdog_init);
-void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle)
+void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
if (test_bit(__QDISC_STATE_DEACTIVATED,
&qdisc_root_sleeping(wd->qdisc)->state))
return;
- if (throttle)
- qdisc_throttled(wd->qdisc);
-
if (wd->last_expires == expires)
return;
@@ -620,7 +614,6 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
hrtimer_cancel(&wd->timer);
- qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
@@ -982,7 +975,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
rcu_assign_pointer(sch->stab, stab);
}
if (tca[TCA_RATE]) {
- spinlock_t *root_lock;
+ seqcount_t *running;
err = -EOPNOTSUPP;
if (sch->flags & TCQ_F_MQROOT)
@@ -991,14 +984,15 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
if ((sch->parent != TC_H_ROOT) &&
!(sch->flags & TCQ_F_INGRESS) &&
(!p || !(p->flags & TCQ_F_MQROOT)))
- root_lock = qdisc_root_sleeping_lock(sch);
+ running = qdisc_root_sleeping_running(sch);
else
- root_lock = qdisc_lock(sch);
+ running = &sch->running;
err = gen_new_estimator(&sch->bstats,
sch->cpu_bstats,
&sch->rate_est,
- root_lock,
+ NULL,
+ running,
tca[TCA_RATE]);
if (err)
goto err_out4;
@@ -1061,7 +1055,8 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
gen_replace_estimator(&sch->bstats,
sch->cpu_bstats,
&sch->rate_est,
- qdisc_root_sleeping_lock(sch),
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
}
out:
@@ -1369,8 +1364,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
goto nla_put_failure;
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
- qdisc_root_sleeping_lock(q), &d,
- TCA_PAD) < 0)
+ NULL, &d, TCA_PAD) < 0)
goto nla_put_failure;
if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
@@ -1381,7 +1375,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
cpu_qstats = q->cpu_qstats;
}
- if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
+ &d, cpu_bstats, &q->bstats) < 0 ||
gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
goto nla_put_failure;
@@ -1684,8 +1679,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
goto nla_put_failure;
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
- qdisc_root_sleeping_lock(q), &d,
- TCA_PAD) < 0)
+ NULL, &d, TCA_PAD) < 0)
goto nla_put_failure;
if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 1911af3ca..481e4f12a 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -357,16 +357,17 @@ static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch,
/* --------------------------- Qdisc operations ---------------------------- */
-static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
struct tcf_result res;
int result;
- int ret = NET_XMIT_POLICED;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
- result = TC_POLICE_OK; /* be nice to gcc */
+ result = TC_ACT_OK; /* be nice to gcc */
flow = NULL;
if (TC_H_MAJ(skb->priority) != sch->handle ||
!(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
@@ -398,12 +399,12 @@ done:
switch (result) {
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
goto drop;
- case TC_POLICE_RECLASSIFY:
+ case TC_ACT_RECLASSIFY:
if (flow->excess)
flow = flow->excess;
else
@@ -413,7 +414,7 @@ done:
#endif
}
- ret = qdisc_enqueue(skb, flow->q);
+ ret = qdisc_enqueue(skb, flow->q, to_free);
if (ret != NET_XMIT_SUCCESS) {
drop: __maybe_unused
if (net_xmit_drop_count(ret)) {
@@ -519,20 +520,6 @@ static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
return p->link.q->ops->peek(p->link.q);
}
-static unsigned int atm_tc_drop(struct Qdisc *sch)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
- unsigned int len;
-
- pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p);
- list_for_each_entry(flow, &p->flows, list) {
- if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
- return len;
- }
- return 0;
-}
-
static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
@@ -637,7 +624,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
{
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
- if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &flow->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
return -1;
@@ -671,7 +659,6 @@ static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
.enqueue = atm_tc_enqueue,
.dequeue = atm_tc_dequeue,
.peek = atm_tc_peek,
- .drop = atm_tc_drop,
.init = atm_tc_init,
.reset = atm_tc_reset,
.destroy = atm_tc_destroy,
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
index 3fee70d98..c98a61e98 100644
--- a/net/sched/sch_blackhole.c
+++ b/net/sched/sch_blackhole.c
@@ -17,9 +17,10 @@
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
-static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_SUCCESS;
}
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index baafddf22..beb554aa8 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -80,10 +80,6 @@ struct cbq_class {
unsigned char priority; /* class priority */
unsigned char priority2; /* priority to be used after overlimit */
unsigned char ewma_log; /* time constant for idle time calculation */
- unsigned char ovl_strategy;
-#ifdef CONFIG_NET_CLS_ACT
- unsigned char police;
-#endif
u32 defmap;
@@ -94,10 +90,6 @@ struct cbq_class {
u32 avpkt;
struct qdisc_rate_table *R_tab;
- /* Overlimit strategy parameters */
- void (*overlimit)(struct cbq_class *cl);
- psched_tdiff_t penalty;
-
/* General scheduler (WRR) parameters */
long allot;
long quantum; /* Allotment per WRR round */
@@ -353,7 +345,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
int toplevel = q->toplevel;
- if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
+ if (toplevel > cl->level) {
psched_time_t now = psched_get_time();
do {
@@ -366,7 +358,8 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
}
static int
-cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct cbq_sched_data *q = qdisc_priv(sch);
int uninitialized_var(ret);
@@ -378,14 +371,11 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl == NULL) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
}
-#ifdef CONFIG_NET_CLS_ACT
- cl->q->__parent = sch;
-#endif
- ret = qdisc_enqueue(skb, cl->q);
+ ret = qdisc_enqueue(skb, cl->q, to_free);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
cbq_mark_toplevel(q, cl);
@@ -402,11 +392,8 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return ret;
}
-/* Overlimit actions */
-
-/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
-
-static void cbq_ovl_classic(struct cbq_class *cl)
+/* Overlimit action: penalize leaf class by adding offtime */
+static void cbq_overlimit(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
psched_tdiff_t delay = cl->undertime - q->now;
@@ -456,99 +443,6 @@ static void cbq_ovl_classic(struct cbq_class *cl)
}
}
-/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
- * they go overlimit
- */
-
-static void cbq_ovl_rclassic(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- struct cbq_class *this = cl;
-
- do {
- if (cl->level > q->toplevel) {
- cl = NULL;
- break;
- }
- } while ((cl = cl->borrow) != NULL);
-
- if (cl == NULL)
- cl = this;
- cbq_ovl_classic(cl);
-}
-
-/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
-
-static void cbq_ovl_delay(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- psched_tdiff_t delay = cl->undertime - q->now;
-
- if (test_bit(__QDISC_STATE_DEACTIVATED,
- &qdisc_root_sleeping(cl->qdisc)->state))
- return;
-
- if (!cl->delayed) {
- psched_time_t sched = q->now;
- ktime_t expires;
-
- delay += cl->offtime;
- if (cl->avgidle < 0)
- delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
- if (cl->avgidle < cl->minidle)
- cl->avgidle = cl->minidle;
- cl->undertime = q->now + delay;
-
- if (delay > 0) {
- sched += delay + cl->penalty;
- cl->penalized = sched;
- cl->cpriority = TC_CBQ_MAXPRIO;
- q->pmask |= (1<<TC_CBQ_MAXPRIO);
-
- expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
- if (hrtimer_try_to_cancel(&q->delay_timer) &&
- ktime_to_ns(ktime_sub(
- hrtimer_get_expires(&q->delay_timer),
- expires)) > 0)
- hrtimer_set_expires(&q->delay_timer, expires);
- hrtimer_restart(&q->delay_timer);
- cl->delayed = 1;
- cl->xstats.overactions++;
- return;
- }
- delay = 1;
- }
- if (q->wd_expires == 0 || q->wd_expires > delay)
- q->wd_expires = delay;
-}
-
-/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
-
-static void cbq_ovl_lowprio(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
-
- cl->penalized = q->now + cl->penalty;
-
- if (cl->cpriority != cl->priority2) {
- cl->cpriority = cl->priority2;
- q->pmask |= (1<<cl->cpriority);
- cl->xstats.overactions++;
- }
- cbq_ovl_classic(cl);
-}
-
-/* TC_CBQ_OVL_DROP: penalize class by dropping */
-
-static void cbq_ovl_drop(struct cbq_class *cl)
-{
- if (cl->q->ops->drop)
- if (cl->q->ops->drop(cl->q))
- cl->qdisc->q.qlen--;
- cl->xstats.overactions++;
- cbq_ovl_classic(cl);
-}
-
static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
psched_time_t now)
{
@@ -620,45 +514,10 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
}
- qdisc_unthrottled(sch);
__netif_schedule(qdisc_root(sch));
return HRTIMER_NORESTART;
}
-#ifdef CONFIG_NET_CLS_ACT
-static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
-{
- struct Qdisc *sch = child->__parent;
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = q->rx_class;
-
- q->rx_class = NULL;
-
- if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
- int ret;
-
- cbq_mark_toplevel(q, cl);
-
- q->rx_class = cl;
- cl->q->__parent = sch;
-
- ret = qdisc_enqueue(skb, cl->q);
- if (ret == NET_XMIT_SUCCESS) {
- sch->q.qlen++;
- if (!cl->next_alive)
- cbq_activate_class(cl);
- return 0;
- }
- if (net_xmit_drop_count(ret))
- qdisc_qstats_drop(sch);
- return 0;
- }
-
- qdisc_qstats_drop(sch);
- return -1;
-}
-#endif
-
/*
* It is mission critical procedure.
*
@@ -807,7 +666,7 @@ cbq_under_limit(struct cbq_class *cl)
cl = cl->borrow;
if (!cl) {
this_cl->qstats.overlimits++;
- this_cl->overlimit(this_cl);
+ cbq_overlimit(this_cl);
return NULL;
}
if (cl->level > q->toplevel)
@@ -960,7 +819,6 @@ cbq_dequeue(struct Qdisc *sch)
if (skb) {
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
- qdisc_unthrottled(sch);
return skb;
}
@@ -1166,31 +1024,6 @@ static void cbq_link_class(struct cbq_class *this)
}
}
-static unsigned int cbq_drop(struct Qdisc *sch)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl, *cl_head;
- int prio;
- unsigned int len;
-
- for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
- cl_head = q->active[prio];
- if (!cl_head)
- continue;
-
- cl = cl_head;
- do {
- if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
- sch->q.qlen--;
- if (!cl->q->q.qlen)
- cbq_deactivate_class(cl);
- return len;
- }
- } while ((cl = cl->next_alive) != cl_head);
- }
- return 0;
-}
-
static void
cbq_reset(struct Qdisc *sch)
{
@@ -1280,50 +1113,6 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
return 0;
}
-static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
-{
- switch (ovl->strategy) {
- case TC_CBQ_OVL_CLASSIC:
- cl->overlimit = cbq_ovl_classic;
- break;
- case TC_CBQ_OVL_DELAY:
- cl->overlimit = cbq_ovl_delay;
- break;
- case TC_CBQ_OVL_LOWPRIO:
- if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
- ovl->priority2 - 1 <= cl->priority)
- return -EINVAL;
- cl->priority2 = ovl->priority2 - 1;
- cl->overlimit = cbq_ovl_lowprio;
- break;
- case TC_CBQ_OVL_DROP:
- cl->overlimit = cbq_ovl_drop;
- break;
- case TC_CBQ_OVL_RCLASSIC:
- cl->overlimit = cbq_ovl_rclassic;
- break;
- default:
- return -EINVAL;
- }
- cl->penalty = ovl->penalty;
- return 0;
-}
-
-#ifdef CONFIG_NET_CLS_ACT
-static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
-{
- cl->police = p->police;
-
- if (cl->q->handle) {
- if (p->police == TC_POLICE_RECLASSIFY)
- cl->q->reshape_fail = cbq_reshape_fail;
- else
- cl->q->reshape_fail = NULL;
- }
- return 0;
-}
-#endif
-
static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
@@ -1375,8 +1164,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
q->link.priority = TC_CBQ_MAXPRIO - 1;
q->link.priority2 = TC_CBQ_MAXPRIO - 1;
q->link.cpriority = TC_CBQ_MAXPRIO - 1;
- q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
- q->link.overlimit = cbq_ovl_classic;
q->link.allot = psched_mtu(qdisc_dev(sch));
q->link.quantum = q->link.allot;
q->link.weight = q->link.R_tab->rate.rate;
@@ -1463,24 +1250,6 @@ nla_put_failure:
return -1;
}
-static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tc_cbq_ovl opt;
-
- opt.strategy = cl->ovl_strategy;
- opt.priority2 = cl->priority2 + 1;
- opt.pad = 0;
- opt.penalty = cl->penalty;
- if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
- goto nla_put_failure;
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
@@ -1500,36 +1269,11 @@ nla_put_failure:
return -1;
}
-#ifdef CONFIG_NET_CLS_ACT
-static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tc_cbq_police opt;
-
- if (cl->police) {
- opt.police = cl->police;
- opt.__res1 = 0;
- opt.__res2 = 0;
- if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
- goto nla_put_failure;
- }
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-#endif
-
static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
if (cbq_dump_lss(skb, cl) < 0 ||
cbq_dump_rate(skb, cl) < 0 ||
cbq_dump_wrr(skb, cl) < 0 ||
- cbq_dump_ovl(skb, cl) < 0 ||
-#ifdef CONFIG_NET_CLS_ACT
- cbq_dump_police(skb, cl) < 0 ||
-#endif
cbq_dump_fopt(skb, cl) < 0)
return -1;
return 0;
@@ -1600,7 +1344,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (cl->undertime != PSCHED_PASTPERFECT)
cl->xstats.undertime = cl->undertime - q->now;
- if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
return -1;
@@ -1618,11 +1363,6 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
&pfifo_qdisc_ops, cl->common.classid);
if (new == NULL)
return -ENOBUFS;
- } else {
-#ifdef CONFIG_NET_CLS_ACT
- if (cl->police == TC_POLICE_RECLASSIFY)
- new->reshape_fail = cbq_reshape_fail;
-#endif
}
*old = qdisc_replace(sch, new, &cl->q);
@@ -1735,6 +1475,9 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
if (err < 0)
return err;
+ if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
+ return -EOPNOTSUPP;
+
if (cl) {
/* Check parent */
if (parentid) {
@@ -1755,7 +1498,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
- qdisc_root_sleeping_lock(sch),
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err) {
qdisc_put_rtab(rtab);
@@ -1782,14 +1526,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
}
- if (tb[TCA_CBQ_OVL_STRATEGY])
- cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
-
-#ifdef CONFIG_NET_CLS_ACT
- if (tb[TCA_CBQ_POLICE])
- cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
-#endif
-
if (tb[TCA_CBQ_FOPT])
cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
@@ -1848,7 +1584,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
if (tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
- qdisc_root_sleeping_lock(sch),
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err) {
kfree(cl);
@@ -1884,13 +1621,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
cl->maxidle = q->link.maxidle;
if (cl->avpkt == 0)
cl->avpkt = q->link.avpkt;
- cl->overlimit = cbq_ovl_classic;
- if (tb[TCA_CBQ_OVL_STRATEGY])
- cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
-#ifdef CONFIG_NET_CLS_ACT
- if (tb[TCA_CBQ_POLICE])
- cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
-#endif
if (tb[TCA_CBQ_FOPT])
cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
sch_tree_unlock(sch);
@@ -2035,7 +1765,6 @@ static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
.enqueue = cbq_enqueue,
.dequeue = cbq_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = cbq_drop,
.init = cbq_init,
.reset = cbq_reset,
.destroy = cbq_destroy,
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 0a08c860e..3b6d5bd69 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -115,7 +115,8 @@ static void choke_zap_tail_holes(struct choke_sched_data *q)
}
/* Drop packet from queue array by creating a "hole" */
-static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
+ struct sk_buff **to_free)
{
struct choke_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb = q->tab[idx];
@@ -129,7 +130,7 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
qdisc_qstats_backlog_dec(sch, skb);
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
--sch->q.qlen;
}
@@ -261,7 +262,8 @@ static bool choke_match_random(const struct choke_sched_data *q,
return choke_match_flow(oskb, nskb);
}
-static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
struct choke_sched_data *q = qdisc_priv(sch);
@@ -288,7 +290,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
/* Draw a packet at random from queue and compare flow */
if (choke_match_random(q, skb, &idx)) {
q->stats.matched++;
- choke_drop_by_idx(sch, idx);
+ choke_drop_by_idx(sch, idx, to_free);
goto congestion_drop;
}
@@ -331,16 +333,16 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
q->stats.pdrop++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
congestion_drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_CN;
other_drop:
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
}
@@ -365,22 +367,6 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch)
return skb;
}
-static unsigned int choke_drop(struct Qdisc *sch)
-{
- struct choke_sched_data *q = qdisc_priv(sch);
- unsigned int len;
-
- len = qdisc_queue_drop(sch);
- if (len > 0)
- q->stats.other++;
- else {
- if (!red_is_idling(&q->vars))
- red_start_of_idle_period(&q->vars);
- }
-
- return len;
-}
-
static void choke_reset(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
@@ -391,11 +377,11 @@ static void choke_reset(struct Qdisc *sch)
q->head = (q->head + 1) & q->tab_mask;
if (!skb)
continue;
- qdisc_qstats_backlog_dec(sch, skb);
- --sch->q.qlen;
- qdisc_drop(skb, sch);
+ rtnl_qdisc_drop(skb, sch);
}
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
q->head = q->tail = 0;
red_restart(&q->vars);
@@ -471,7 +457,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
--sch->q.qlen;
- qdisc_drop(skb, sch);
+ rtnl_qdisc_drop(skb, sch);
}
qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
q->head = 0;
@@ -569,7 +555,6 @@ static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
.enqueue = choke_enqueue,
.dequeue = choke_dequeue,
.peek = choke_peek_head,
- .drop = choke_drop,
.init = choke_init,
.destroy = choke_destroy,
.reset = choke_reset,
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index dddf3bb65..4002df3c7 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -82,7 +82,8 @@ static void drop_func(struct sk_buff *skb, void *ctx)
{
struct Qdisc *sch = ctx;
- qdisc_drop(skb, sch);
+ kfree_skb(skb);
+ qdisc_qstats_drop(sch);
}
static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
@@ -107,7 +108,8 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
return skb;
}
-static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct codel_sched_data *q;
@@ -117,7 +119,7 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
q = qdisc_priv(sch);
q->drop_overlimit++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
@@ -174,7 +176,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
- qdisc_drop(skb, sch);
+ rtnl_qdisc_drop(skb, sch);
}
qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index bf8af2c43..8af5c59ee 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -91,7 +91,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
- qdisc_root_sleeping_lock(sch),
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err)
return err;
@@ -119,7 +120,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
- qdisc_root_sleeping_lock(sch),
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err) {
qdisc_destroy(cl->qdisc);
@@ -279,7 +281,8 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (qlen)
xstats.deficit = cl->deficit;
- if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
return -1;
@@ -347,7 +350,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
return NULL;
}
-static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
@@ -357,11 +361,11 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl == NULL) {
if (err & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return err;
}
- err = qdisc_enqueue(skb, cl->qdisc);
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
cl->qstats.drops++;
@@ -420,27 +424,6 @@ out:
return NULL;
}
-static unsigned int drr_drop(struct Qdisc *sch)
-{
- struct drr_sched *q = qdisc_priv(sch);
- struct drr_class *cl;
- unsigned int len;
-
- list_for_each_entry(cl, &q->active, alist) {
- if (cl->qdisc->ops->drop) {
- len = cl->qdisc->ops->drop(cl->qdisc);
- if (len > 0) {
- sch->qstats.backlog -= len;
- sch->q.qlen--;
- if (cl->qdisc->q.qlen == 0)
- list_del(&cl->alist);
- return len;
- }
- }
- }
- return 0;
-}
-
static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
{
struct drr_sched *q = qdisc_priv(sch);
@@ -510,7 +493,6 @@ static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
.enqueue = drr_enqueue,
.dequeue = drr_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = drr_drop,
.init = drr_init_qdisc,
.reset = drr_reset_qdisc,
.destroy = drr_destroy_qdisc,
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 34b4ddaca..1308bbf46 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -191,7 +191,8 @@ static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch,
/* --------------------------- Qdisc operations ---------------------------- */
-static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
int err;
@@ -234,7 +235,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
#ifdef CONFIG_NET_CLS_ACT
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
@@ -251,7 +252,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
}
- err = qdisc_enqueue(skb, p->q);
+ err = qdisc_enqueue(skb, p->q, to_free);
if (err != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(err))
qdisc_qstats_drop(sch);
@@ -264,7 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return NET_XMIT_SUCCESS;
drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
@@ -320,23 +321,6 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch)
return p->q->ops->peek(p->q);
}
-static unsigned int dsmark_drop(struct Qdisc *sch)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- unsigned int len;
-
- pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
- if (p->q->ops->drop == NULL)
- return 0;
-
- len = p->q->ops->drop(p->q);
- if (len)
- sch->q.qlen--;
-
- return len;
-}
-
static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
@@ -489,7 +473,6 @@ static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
.enqueue = dsmark_enqueue,
.dequeue = dsmark_dequeue,
.peek = dsmark_peek,
- .drop = dsmark_drop,
.init = dsmark_init,
.reset = dsmark_reset,
.destroy = dsmark_destroy,
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 2e4bd2c0a..baeed6a78 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -19,23 +19,26 @@
/* 1 band FIFO pseudo-"scheduler" */
-static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
return qdisc_enqueue_tail(skb, sch);
- return qdisc_reshape_fail(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
-static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
- return qdisc_reshape_fail(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
-static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
unsigned int prev_backlog;
@@ -44,7 +47,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
prev_backlog = sch->qstats.backlog;
/* queue full, remove one skb to fulfill the limit */
- __qdisc_queue_drop_head(sch, &sch->q);
+ __qdisc_queue_drop_head(sch, &sch->q, to_free);
qdisc_qstats_drop(sch);
qdisc_enqueue_tail(skb, sch);
@@ -103,7 +106,6 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.enqueue = pfifo_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
- .drop = qdisc_queue_drop,
.init = fifo_init,
.reset = qdisc_reset_queue,
.change = fifo_init,
@@ -118,7 +120,6 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
.enqueue = bfifo_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
- .drop = qdisc_queue_drop,
.init = fifo_init,
.reset = qdisc_reset_queue,
.change = fifo_init,
@@ -133,7 +134,6 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
.enqueue = pfifo_tail_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
- .drop = qdisc_queue_drop_head,
.init = fifo_init,
.reset = qdisc_reset_queue,
.change = fifo_init,
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 3c6a47d66..e5458b99e 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -368,18 +368,19 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
}
}
-static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct fq_flow *f;
if (unlikely(sch->q.qlen >= sch->limit))
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
f = fq_classify(skb, q);
if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
q->stat_flows_plimit++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
f->qlen++;
@@ -445,8 +446,7 @@ begin:
if (!head->first) {
if (q->time_next_delayed_flow != ~0ULL)
qdisc_watchdog_schedule_ns(&q->watchdog,
- q->time_next_delayed_flow,
- false);
+ q->time_next_delayed_flow);
return NULL;
}
}
@@ -515,17 +515,25 @@ out:
return skb;
}
+static void fq_flow_purge(struct fq_flow *flow)
+{
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+ flow->qlen = 0;
+}
+
static void fq_reset(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct rb_root *root;
- struct sk_buff *skb;
struct rb_node *p;
struct fq_flow *f;
unsigned int idx;
- while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
- kfree_skb(skb);
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
+
+ fq_flow_purge(&q->internal);
if (!q->fq_root)
return;
@@ -536,8 +544,7 @@ static void fq_reset(struct Qdisc *sch)
f = container_of(p, struct fq_flow, fq_node);
rb_erase(p, root);
- while ((skb = fq_dequeue_head(sch, f)) != NULL)
- kfree_skb(skb);
+ fq_flow_purge(f);
kmem_cache_free(fq_flow_cachep, f);
}
@@ -738,7 +745,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
if (!skb)
break;
drop_len += qdisc_pkt_len(skb);
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
drop_count++;
}
qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index da250b2e0..a5ea0e9b6 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -139,7 +139,8 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
skb->next = NULL;
}
-static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
+ struct sk_buff **to_free)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
@@ -171,8 +172,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
do {
skb = dequeue_head(flow);
len += qdisc_pkt_len(skb);
- mem += skb->truesize;
- kfree_skb(skb);
+ mem += get_codel_cb(skb)->mem_usage;
+ __qdisc_drop(skb, to_free);
} while (++i < max_packets && len < threshold);
flow->dropped += i;
@@ -184,16 +185,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
return idx;
}
-static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
-{
- unsigned int prev_backlog;
-
- prev_backlog = sch->qstats.backlog;
- fq_codel_drop(sch, 1U);
- return prev_backlog - sch->qstats.backlog;
-}
-
-static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
unsigned int idx, prev_backlog, prev_qlen;
@@ -206,7 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (idx == 0) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
}
idx--;
@@ -223,7 +216,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
flow->deficit = q->quantum;
flow->dropped = 0;
}
- q->memory_usage += skb->truesize;
+ get_codel_cb(skb)->mem_usage = skb->truesize;
+ q->memory_usage += get_codel_cb(skb)->mem_usage;
memory_limited = q->memory_usage > q->memory_limit;
if (++sch->q.qlen <= sch->limit && !memory_limited)
return NET_XMIT_SUCCESS;
@@ -238,7 +232,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
* So instead of dropping a single packet, drop half of its backlog
* with a 64 packets limit to not add a too big cpu spike here.
*/
- ret = fq_codel_drop(sch, q->drop_batch_size);
+ ret = fq_codel_drop(sch, q->drop_batch_size, to_free);
prev_qlen -= sch->q.qlen;
prev_backlog -= sch->qstats.backlog;
@@ -274,7 +268,7 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
if (flow->head) {
skb = dequeue_head(flow);
q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
- q->memory_usage -= skb->truesize;
+ q->memory_usage -= get_codel_cb(skb)->mem_usage;
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
}
@@ -285,7 +279,8 @@ static void drop_func(struct sk_buff *skb, void *ctx)
{
struct Qdisc *sch = ctx;
- qdisc_drop(skb, sch);
+ kfree_skb(skb);
+ qdisc_qstats_drop(sch);
}
static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
@@ -345,6 +340,12 @@ begin:
return skb;
}
+static void fq_codel_flow_purge(struct fq_codel_flow *flow)
+{
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+}
+
static void fq_codel_reset(struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
@@ -355,18 +356,13 @@ static void fq_codel_reset(struct Qdisc *sch)
for (i = 0; i < q->flows_cnt; i++) {
struct fq_codel_flow *flow = q->flows + i;
- while (flow->head) {
- struct sk_buff *skb = dequeue_head(flow);
-
- qdisc_qstats_backlog_dec(sch, skb);
- kfree_skb(skb);
- }
-
+ fq_codel_flow_purge(flow);
INIT_LIST_HEAD(&flow->flowchain);
codel_vars_init(&flow->cvars);
}
memset(q->backlogs, 0, q->flows_cnt * sizeof(u32));
sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
q->memory_usage = 0;
}
@@ -442,7 +438,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
struct sk_buff *skb = fq_codel_dequeue(sch);
q->cstats.drop_len += qdisc_pkt_len(skb);
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
q->cstats.drop_count++;
}
qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len);
@@ -578,11 +574,13 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.qdisc_stats.memory_usage = q->memory_usage;
st.qdisc_stats.drop_overmemory = q->drop_overmemory;
+ sch_tree_lock(sch);
list_for_each(pos, &q->new_flows)
st.qdisc_stats.new_flows_len++;
list_for_each(pos, &q->old_flows)
st.qdisc_stats.old_flows_len++;
+ sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
}
@@ -636,7 +634,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
if (idx < q->flows_cnt) {
const struct fq_codel_flow *flow = &q->flows[idx];
- const struct sk_buff *skb = flow->head;
+ const struct sk_buff *skb;
memset(&xstats, 0, sizeof(xstats));
xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
@@ -654,9 +652,14 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
codel_time_to_us(delta) :
-codel_time_to_us(-delta);
}
- while (skb) {
- qs.qlen++;
- skb = skb->next;
+ if (flow->head) {
+ sch_tree_lock(sch);
+ skb = flow->head;
+ while (skb) {
+ qs.qlen++;
+ skb = skb->next;
+ }
+ sch_tree_unlock(sch);
}
qs.backlog = q->backlogs[idx];
qs.drops = flow->dropped;
@@ -709,7 +712,6 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
.enqueue = fq_codel_enqueue,
.dequeue = fq_codel_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = fq_codel_qdisc_drop,
.init = fq_codel_init,
.reset = fq_codel_reset,
.destroy = fq_codel_destroy,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f9e0e9c03..657c13362 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -77,6 +77,34 @@ static void try_bulk_dequeue_skb(struct Qdisc *q,
skb->next = NULL;
}
+/* This variant of try_bulk_dequeue_skb() makes sure
+ * all skbs in the chain are for the same txq
+ */
+static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
+ struct sk_buff *skb,
+ int *packets)
+{
+ int mapping = skb_get_queue_mapping(skb);
+ struct sk_buff *nskb;
+ int cnt = 0;
+
+ do {
+ nskb = q->dequeue(q);
+ if (!nskb)
+ break;
+ if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
+ q->skb_bad_txq = nskb;
+ qdisc_qstats_backlog_inc(q, nskb);
+ q->q.qlen++;
+ break;
+ }
+ skb->next = nskb;
+ skb = nskb;
+ } while (++cnt < 8);
+ (*packets) += cnt;
+ skb->next = NULL;
+}
+
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
* A requeued skb (via q->gso_skb) can also be a SKB list.
*/
@@ -87,8 +115,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
const struct netdev_queue *txq = q->dev_queue;
*packets = 1;
- *validate = true;
if (unlikely(skb)) {
+ /* skb in gso_skb were already validated */
+ *validate = false;
/* check the reason of requeuing without tx lock first */
txq = skb_get_tx_queue(txq->dev, skb);
if (!netif_xmit_frozen_or_stopped(txq)) {
@@ -97,22 +126,37 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
q->q.qlen--;
} else
skb = NULL;
- /* skb in gso_skb were already validated */
- *validate = false;
- } else {
- if (!(q->flags & TCQ_F_ONETXQUEUE) ||
- !netif_xmit_frozen_or_stopped(txq)) {
- skb = q->dequeue(q);
- if (skb && qdisc_may_bulk(q))
- try_bulk_dequeue_skb(q, skb, txq, packets);
+ return skb;
+ }
+ *validate = true;
+ skb = q->skb_bad_txq;
+ if (unlikely(skb)) {
+ /* check the reason of requeuing without tx lock first */
+ txq = skb_get_tx_queue(txq->dev, skb);
+ if (!netif_xmit_frozen_or_stopped(txq)) {
+ q->skb_bad_txq = NULL;
+ qdisc_qstats_backlog_dec(q, skb);
+ q->q.qlen--;
+ goto bulk;
}
+ return NULL;
+ }
+ if (!(q->flags & TCQ_F_ONETXQUEUE) ||
+ !netif_xmit_frozen_or_stopped(txq))
+ skb = q->dequeue(q);
+ if (skb) {
+bulk:
+ if (qdisc_may_bulk(q))
+ try_bulk_dequeue_skb(q, skb, txq, packets);
+ else
+ try_bulk_dequeue_skb_slow(q, skb, packets);
}
return skb;
}
/*
* Transmit possibly several skbs, and handle the return status as
- * required. Holding the __QDISC___STATE_RUNNING bit guarantees that
+ * required. Owning running seqcount bit guarantees that
* only one CPU can execute this function.
*
* Returns to the caller:
@@ -165,7 +209,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
/*
* NOTE: Called under qdisc_lock(q) with locally disabled BH.
*
- * __QDISC___STATE_RUNNING guarantees only one CPU can process
+ * running seqcount guarantees only one CPU can process
* this qdisc at a time. qdisc_lock(q) serializes queue accesses for
* this queue.
*
@@ -348,9 +392,10 @@ EXPORT_SYMBOL(netif_carrier_off);
cheaper.
*/
-static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
+ struct sk_buff **to_free)
{
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return NET_XMIT_CN;
}
@@ -381,6 +426,7 @@ struct Qdisc noop_qdisc = {
.list = LIST_HEAD_INIT(noop_qdisc.list),
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
+ .running = SEQCNT_ZERO(noop_qdisc.running),
.busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);
@@ -438,7 +484,8 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
return priv->q + band;
}
-static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
+ struct sk_buff **to_free)
{
if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
@@ -450,7 +497,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
return __qdisc_enqueue_tail(skb, qdisc, list);
}
- return qdisc_drop(skb, qdisc);
+ return qdisc_drop(skb, qdisc, to_free);
}
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
@@ -492,7 +539,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
- __qdisc_reset_queue(qdisc, band2list(priv, prio));
+ __qdisc_reset_queue(band2list(priv, prio));
priv->bitmap = 0;
qdisc->qstats.backlog = 0;
@@ -539,6 +586,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
EXPORT_SYMBOL(pfifo_fast_ops);
static struct lock_class_key qdisc_tx_busylock;
+static struct lock_class_key qdisc_running_key;
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
const struct Qdisc_ops *ops)
@@ -572,6 +620,10 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
lockdep_set_class(&sch->busylock,
dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+ seqcount_init(&sch->running);
+ lockdep_set_class(&sch->running,
+ dev->qdisc_running_key ?: &qdisc_running_key);
+
sch->ops = ops;
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
@@ -591,18 +643,19 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
struct Qdisc *sch;
if (!try_module_get(ops->owner))
- goto errout;
+ return NULL;
sch = qdisc_alloc(dev_queue, ops);
- if (IS_ERR(sch))
- goto errout;
+ if (IS_ERR(sch)) {
+ module_put(ops->owner);
+ return NULL;
+ }
sch->parent = parentid;
if (!ops->init || ops->init(sch, NULL) == 0)
return sch;
qdisc_destroy(sch);
-errout:
return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
@@ -616,11 +669,14 @@ void qdisc_reset(struct Qdisc *qdisc)
if (ops->reset)
ops->reset(qdisc);
+ kfree_skb(qdisc->skb_bad_txq);
+ qdisc->skb_bad_txq = NULL;
+
if (qdisc->gso_skb) {
kfree_skb_list(qdisc->gso_skb);
qdisc->gso_skb = NULL;
- qdisc->q.qlen = 0;
}
+ qdisc->q.qlen = 0;
}
EXPORT_SYMBOL(qdisc_reset);
@@ -659,6 +715,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
dev_put(qdisc_dev(qdisc));
kfree_skb_list(qdisc->gso_skb);
+ kfree_skb(qdisc->skb_bad_txq);
/*
* gen_estimator est_timer() might access qdisc->q.lock,
* wait a RCU grace period before freeing qdisc.
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 80105109f..c78a093c5 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -149,7 +149,8 @@ static inline int gred_use_harddrop(struct gred_sched *t)
return t->red_flags & TC_RED_HARDDROP;
}
-static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct gred_sched_data *q = NULL;
struct gred_sched *t = qdisc_priv(sch);
@@ -237,10 +238,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->stats.pdrop++;
drop:
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
congestion_drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_CN;
}
@@ -276,40 +277,6 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch)
return NULL;
}
-static unsigned int gred_drop(struct Qdisc *sch)
-{
- struct sk_buff *skb;
- struct gred_sched *t = qdisc_priv(sch);
-
- skb = qdisc_dequeue_tail(sch);
- if (skb) {
- unsigned int len = qdisc_pkt_len(skb);
- struct gred_sched_data *q;
- u16 dp = tc_index_to_dp(skb);
-
- if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
- net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n",
- tc_index_to_dp(skb));
- } else {
- q->backlog -= len;
- q->stats.other++;
-
- if (gred_wred_mode(t)) {
- if (!sch->qstats.backlog)
- red_start_of_idle_period(&t->wred_set);
- } else {
- if (!q->backlog)
- red_start_of_idle_period(&q->vars);
- }
- }
-
- qdisc_drop(skb, sch);
- return len;
- }
-
- return 0;
-}
-
static void gred_reset(struct Qdisc *sch)
{
int i;
@@ -623,7 +590,6 @@ static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
.enqueue = gred_enqueue,
.dequeue = gred_dequeue,
.peek = qdisc_peek_head,
- .drop = gred_drop,
.init = gred_init,
.reset = gred_reset,
.destroy = gred_destroy,
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 1ac9f9f03..3ddc7bd74 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -115,9 +115,9 @@ struct hfsc_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
struct gnet_stats_rate_est64 rate_est;
- unsigned int level; /* class level in hierarchy */
struct tcf_proto __rcu *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
+ unsigned int level; /* class level in hierarchy */
struct hfsc_sched *sched; /* scheduler data */
struct hfsc_class *cl_parent; /* parent class */
@@ -130,7 +130,6 @@ struct hfsc_class {
struct rb_node vt_node; /* parent's vt_tree member */
struct rb_root cf_tree; /* active children sorted by cl_f */
struct rb_node cf_node; /* parent's cf_heap member */
- struct list_head dlist; /* drop list member */
u64 cl_total; /* total work in bytes */
u64 cl_cumul; /* cumulative work in bytes done by
@@ -166,10 +165,10 @@ struct hfsc_class {
struct runtime_sc cl_virtual; /* virtual curve */
struct runtime_sc cl_ulimit; /* upperlimit curve */
- unsigned long cl_flags; /* which curves are valid */
- unsigned long cl_vtperiod; /* vt period sequence number */
- unsigned long cl_parentperiod;/* parent's vt period sequence number*/
- unsigned long cl_nactive; /* number of active children */
+ u8 cl_flags; /* which curves are valid */
+ u32 cl_vtperiod; /* vt period sequence number */
+ u32 cl_parentperiod;/* parent's vt period sequence number*/
+ u32 cl_nactive; /* number of active children */
};
struct hfsc_sched {
@@ -177,8 +176,6 @@ struct hfsc_sched {
struct hfsc_class root; /* root class */
struct Qdisc_class_hash clhash; /* class hash */
struct rb_root eligible; /* eligible tree */
- struct list_head droplist; /* active leaf class list (for
- dropping) */
struct qdisc_watchdog watchdog; /* watchdog timer */
};
@@ -781,6 +778,20 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
else
go_passive = 0;
+ /* update vt */
+ cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
+ - cl->cl_vtoff + cl->cl_vtadj;
+
+ /*
+ * if vt of the class is smaller than cvtmin,
+ * the class was skipped in the past due to non-fit.
+ * if so, we need to adjust vtadj.
+ */
+ if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
+ cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+ cl->cl_vt = cl->cl_parent->cl_cvtmin;
+ }
+
if (go_passive) {
/* no more active child, going passive */
@@ -797,25 +808,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
continue;
}
- /*
- * update vt and f
- */
- cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
- - cl->cl_vtoff + cl->cl_vtadj;
-
- /*
- * if vt of the class is smaller than cvtmin,
- * the class was skipped in the past due to non-fit.
- * if so, we need to adjust vtadj.
- */
- if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
- cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
- cl->cl_vt = cl->cl_parent->cl_cvtmin;
- }
-
/* update the vt tree */
vttree_update(cl);
+ /* update f */
if (cl->cl_flags & HFSC_USC) {
cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
cl->cl_total);
@@ -858,7 +854,6 @@ set_active(struct hfsc_class *cl, unsigned int len)
if (cl->cl_flags & HFSC_FSC)
init_vf(cl, len);
- list_add_tail(&cl->dlist, &cl->sched->droplist);
}
static void
@@ -867,8 +862,6 @@ set_passive(struct hfsc_class *cl)
if (cl->cl_flags & HFSC_RSC)
eltree_remove(cl);
- list_del(&cl->dlist);
-
/*
* vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
* needs to be called explicitly to remove a class from vttree.
@@ -882,7 +875,7 @@ qdisc_peek_len(struct Qdisc *sch)
unsigned int len;
skb = sch->ops->peek(sch);
- if (skb == NULL) {
+ if (unlikely(skb == NULL)) {
qdisc_warn_nonwc("qdisc_peek_len", sch);
return 0;
}
@@ -947,7 +940,7 @@ static void
hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
{
sc2isc(fsc, &cl->cl_fsc);
- rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
+ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vtoff + cl->cl_vt, cl->cl_total);
cl->cl_flags |= HFSC_FSC;
}
@@ -1015,11 +1008,10 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
cur_time = psched_get_time();
if (tca[TCA_RATE]) {
- spinlock_t *lock = qdisc_root_sleeping_lock(sch);
-
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
- lock,
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err)
return err;
@@ -1068,7 +1060,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
- qdisc_root_sleeping_lock(sch),
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err) {
kfree(cl);
@@ -1373,7 +1366,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
xstats.work = cl->cl_total;
xstats.rtwork = cl->cl_cumul;
- if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
return -1;
@@ -1443,7 +1436,6 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
if (err < 0)
return err;
q->eligible = RB_ROOT;
- INIT_LIST_HEAD(&q->droplist);
q->root.cl_common.classid = sch->handle;
q->root.refcnt = 1;
@@ -1527,7 +1519,6 @@ hfsc_reset_qdisc(struct Qdisc *sch)
hfsc_reset_class(cl);
}
q->eligible = RB_ROOT;
- INIT_LIST_HEAD(&q->droplist);
qdisc_watchdog_cancel(&q->watchdog);
sch->qstats.backlog = 0;
sch->q.qlen = 0;
@@ -1572,7 +1563,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
}
static int
-hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
struct hfsc_class *cl;
int uninitialized_var(err);
@@ -1581,11 +1572,11 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl == NULL) {
if (err & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return err;
}
- err = qdisc_enqueue(skb, cl->qdisc);
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
cl->qstats.drops++;
@@ -1594,8 +1585,17 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return err;
}
- if (cl->qdisc->q.qlen == 1)
+ if (cl->qdisc->q.qlen == 1) {
set_active(cl, qdisc_pkt_len(skb));
+ /*
+ * If this is the first packet, isolate the head so an eventual
+ * head drop before the first dequeue operation has no chance
+ * to invalidate the deadline.
+ */
+ if (cl->cl_flags & HFSC_RSC)
+ cl->qdisc->ops->peek(cl->qdisc);
+
+ }
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
@@ -1664,7 +1664,6 @@ hfsc_dequeue(struct Qdisc *sch)
set_passive(cl);
}
- qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
@@ -1672,32 +1671,6 @@ hfsc_dequeue(struct Qdisc *sch)
return skb;
}
-static unsigned int
-hfsc_drop(struct Qdisc *sch)
-{
- struct hfsc_sched *q = qdisc_priv(sch);
- struct hfsc_class *cl;
- unsigned int len;
-
- list_for_each_entry(cl, &q->droplist, dlist) {
- if (cl->qdisc->ops->drop != NULL &&
- (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) {
- if (cl->qdisc->q.qlen == 0) {
- update_vf(cl, 0, 0);
- set_passive(cl);
- } else {
- list_move_tail(&cl->dlist, &q->droplist);
- }
- cl->qstats.drops++;
- qdisc_qstats_drop(sch);
- sch->qstats.backlog -= len;
- sch->q.qlen--;
- return len;
- }
- }
- return 0;
-}
-
static const struct Qdisc_class_ops hfsc_class_ops = {
.change = hfsc_change_class,
.delete = hfsc_delete_class,
@@ -1724,7 +1697,6 @@ static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = {
.enqueue = hfsc_enqueue,
.dequeue = hfsc_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = hfsc_drop,
.cl_ops = &hfsc_class_ops,
.priv_size = sizeof(struct hfsc_sched),
.owner = THIS_MODULE
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 13d6f83ec..e3d0458af 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -345,7 +345,7 @@ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
skb->next = NULL;
}
-static unsigned int hhf_drop(struct Qdisc *sch)
+static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
struct hhf_sched_data *q = qdisc_priv(sch);
struct wdrr_bucket *bucket;
@@ -359,25 +359,16 @@ static unsigned int hhf_drop(struct Qdisc *sch)
struct sk_buff *skb = dequeue_head(bucket);
sch->q.qlen--;
- qdisc_qstats_drop(sch);
qdisc_qstats_backlog_dec(sch, skb);
- kfree_skb(skb);
+ qdisc_drop(skb, sch, to_free);
}
/* Return id of the bucket from which the packet was dropped. */
return bucket - q->buckets;
}
-static unsigned int hhf_qdisc_drop(struct Qdisc *sch)
-{
- unsigned int prev_backlog;
-
- prev_backlog = sch->qstats.backlog;
- hhf_drop(sch);
- return prev_backlog - sch->qstats.backlog;
-}
-
-static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct hhf_sched_data *q = qdisc_priv(sch);
enum wdrr_bucket_idx idx;
@@ -415,7 +406,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
/* Return Congestion Notification only if we dropped a packet from this
* bucket.
*/
- if (hhf_drop(sch) == idx)
+ if (hhf_drop(sch, to_free) == idx)
return NET_XMIT_CN;
/* As we dropped a packet, better let upper stack know this. */
@@ -473,7 +464,7 @@ static void hhf_reset(struct Qdisc *sch)
struct sk_buff *skb;
while ((skb = hhf_dequeue(sch)) != NULL)
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
}
static void *hhf_zalloc(size_t sz)
@@ -583,7 +574,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = hhf_dequeue(sch);
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
}
qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
prev_backlog - sch->qstats.backlog);
@@ -709,7 +700,6 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
.enqueue = hhf_enqueue,
.dequeue = hhf_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = hhf_qdisc_drop,
.init = hhf_init,
.reset = hhf_reset,
.destroy = hhf_destroy,
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 052f84d6c..53dbfa187 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -117,7 +117,6 @@ struct htb_class {
* Written often fields
*/
struct gnet_stats_basic_packed bstats;
- struct gnet_stats_queue qstats;
struct tc_htb_xstats xstats; /* our special stats */
/* token bucket parameters */
@@ -140,6 +139,8 @@ struct htb_class {
enum htb_cmode cmode; /* current mode of the class */
struct rb_node pq_node; /* node for event queue */
struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
+
+ unsigned int drops ____cacheline_aligned_in_smp;
};
struct htb_level {
@@ -569,7 +570,8 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
list_del_init(&cl->un.leaf.drop_list);
}
-static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
int uninitialized_var(ret);
struct htb_sched *q = qdisc_priv(sch);
@@ -581,19 +583,20 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
__skb_queue_tail(&q->direct_queue, skb);
q->direct_pkts++;
} else {
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
#ifdef CONFIG_NET_CLS_ACT
} else if (!cl) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
#endif
- } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
+ } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
+ to_free)) != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret)) {
qdisc_qstats_drop(sch);
- cl->qstats.drops++;
+ cl->drops++;
}
return ret;
} else {
@@ -889,7 +892,6 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
if (skb != NULL) {
ok:
qdisc_bstats_update(sch, skb);
- qdisc_unthrottled(sch);
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
@@ -929,38 +931,13 @@ ok:
}
qdisc_qstats_overlimit(sch);
if (likely(next_event > q->now))
- qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true);
+ qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
else
schedule_work(&q->work);
fin:
return skb;
}
-/* try to drop from each class (by prio) until one succeed */
-static unsigned int htb_drop(struct Qdisc *sch)
-{
- struct htb_sched *q = qdisc_priv(sch);
- int prio;
-
- for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
- struct list_head *p;
- list_for_each(p, q->drops + prio) {
- struct htb_class *cl = list_entry(p, struct htb_class,
- un.leaf.drop_list);
- unsigned int len;
- if (cl->un.leaf.q->ops->drop &&
- (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
- sch->qstats.backlog -= len;
- sch->q.qlen--;
- if (!cl->un.leaf.q->q.qlen)
- htb_deactivate(q, cl);
- return len;
- }
- }
- }
- return 0;
-}
-
/* reset all classes */
/* always caled under BH & queue lock */
static void htb_reset(struct Qdisc *sch)
@@ -983,7 +960,7 @@ static void htb_reset(struct Qdisc *sch)
}
}
qdisc_watchdog_cancel(&q->watchdog);
- __skb_queue_purge(&q->direct_queue);
+ __qdisc_reset_queue(&q->direct_queue);
sch->q.qlen = 0;
sch->qstats.backlog = 0;
memset(q->hlevel, 0, sizeof(q->hlevel));
@@ -1136,18 +1113,24 @@ static int
htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
{
struct htb_class *cl = (struct htb_class *)arg;
+ struct gnet_stats_queue qs = {
+ .drops = cl->drops,
+ };
__u32 qlen = 0;
- if (!cl->level && cl->un.leaf.q)
+ if (!cl->level && cl->un.leaf.q) {
qlen = cl->un.leaf.q->q.qlen;
+ qs.backlog = cl->un.leaf.q->qstats.backlog;
+ }
cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
INT_MIN, INT_MAX);
cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
INT_MIN, INT_MAX);
- if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
+ gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
return -1;
return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
@@ -1260,7 +1243,7 @@ static void htb_destroy(struct Qdisc *sch)
htb_destroy_class(sch, cl);
}
qdisc_class_hash_destroy(&q->clhash);
- __skb_queue_purge(&q->direct_queue);
+ __qdisc_reset_queue(&q->direct_queue);
}
static int htb_delete(struct Qdisc *sch, unsigned long arg)
@@ -1399,7 +1382,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (htb_rate_est || tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL,
&cl->rate_est,
- qdisc_root_sleeping_lock(sch),
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE] ? : &est.nla);
if (err) {
kfree(cl);
@@ -1461,11 +1445,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
parent->children++;
} else {
if (tca[TCA_RATE]) {
- spinlock_t *lock = qdisc_root_sleeping_lock(sch);
-
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
- lock,
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err)
return err;
@@ -1603,7 +1586,6 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
.enqueue = htb_enqueue,
.dequeue = htb_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = htb_drop,
.init = htb_init,
.reset = htb_reset,
.destroy = htb_destroy,
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 56a77b878..b9439827c 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -199,7 +199,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
+ if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
return -1;
return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b8002ce3d..549c66359 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -342,7 +342,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
* hold here is the look on dev_queue->qdisc_sleeping
* also acquired below.
*/
- spin_unlock_bh(d->lock);
+ if (d->lock)
+ spin_unlock_bh(d->lock);
for (i = tc.offset; i < tc.offset + tc.count; i++) {
struct netdev_queue *q = netdev_get_tx_queue(dev, i);
@@ -359,15 +360,17 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
spin_unlock_bh(qdisc_lock(qdisc));
}
/* Reclaim root sleeping lock before completing stats */
- spin_lock_bh(d->lock);
- if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 ||
+ if (d->lock)
+ spin_lock_bh(d->lock);
+ if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
return -1;
} else {
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &sch->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL,
&sch->qstats, sch->q.qlen) < 0)
return -1;
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index bcdd54bb1..9ffbb025b 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -65,7 +65,8 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
}
static int
-multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct Qdisc *qdisc;
int ret;
@@ -76,12 +77,12 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
}
#endif
- ret = qdisc_enqueue(skb, qdisc);
+ ret = qdisc_enqueue(skb, qdisc, to_free);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -151,27 +152,6 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch)
}
-static unsigned int multiq_drop(struct Qdisc *sch)
-{
- struct multiq_sched_data *q = qdisc_priv(sch);
- int band;
- unsigned int len;
- struct Qdisc *qdisc;
-
- for (band = q->bands - 1; band >= 0; band--) {
- qdisc = q->queues[band];
- if (qdisc->ops->drop) {
- len = qdisc->ops->drop(qdisc);
- if (len != 0) {
- sch->q.qlen--;
- return len;
- }
- }
- }
- return 0;
-}
-
-
static void
multiq_reset(struct Qdisc *sch)
{
@@ -356,7 +336,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct Qdisc *cl_q;
cl_q = q->queues[cl - 1];
- if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl_q->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
return -1;
@@ -415,7 +396,6 @@ static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
.enqueue = multiq_enqueue,
.dequeue = multiq_dequeue,
.peek = multiq_peek,
- .drop = multiq_drop,
.init = multiq_init,
.reset = multiq_reset,
.destroy = multiq_destroy,
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 178f1630a..aaaf02175 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -368,9 +368,7 @@ static void tfifo_reset(struct Qdisc *sch)
struct sk_buff *skb = netem_rb_to_skb(p);
rb_erase(p, &q->t_root);
- skb->next = NULL;
- skb->prev = NULL;
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
}
}
@@ -399,7 +397,8 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
* when we statistically choose to corrupt one, we instead segment it, returning
* the first packet to be corrupted, and re-enqueue the remaining frames
*/
-static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
+static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct sk_buff *segs;
netdev_features_t features = netif_skb_features(skb);
@@ -407,7 +406,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
- qdisc_reshape_fail(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NULL;
}
consume_skb(skb);
@@ -420,7 +419,8 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
* NET_XMIT_DROP: queue length didn't change.
* NET_XMIT_SUCCESS: one skb was queued.
*/
-static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct netem_sched_data *q = qdisc_priv(sch);
/* We don't fill cb now as skb_unshare() may invalidate it */
@@ -445,7 +445,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
if (count == 0) {
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
@@ -465,7 +465,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
q->duplicate = 0;
- rootq->enqueue(skb2, rootq);
+ rootq->enqueue(skb2, rootq, to_free);
q->duplicate = dupsave;
}
@@ -477,7 +477,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
*/
if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
if (skb_is_gso(skb)) {
- segs = netem_segment(skb, sch);
+ segs = netem_segment(skb, sch, to_free);
if (!segs)
return NET_XMIT_DROP;
} else {
@@ -487,10 +487,14 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
skb = segs;
segs = segs->next;
- if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
- (skb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(skb))) {
- rc = qdisc_drop(skb, sch);
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (unlikely(!skb)) {
+ qdisc_qstats_drop(sch);
+ goto finish_segs;
+ }
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(skb)) {
+ qdisc_drop(skb, sch, to_free);
goto finish_segs;
}
@@ -499,7 +503,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
- return qdisc_reshape_fail(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
qdisc_qstats_backlog_inc(sch, skb);
@@ -559,7 +563,7 @@ finish_segs:
segs->next = NULL;
qdisc_skb_cb(segs)->pkt_len = segs->len;
last_len = segs->len;
- rc = qdisc_enqueue(segs, sch);
+ rc = qdisc_enqueue(segs, sch, to_free);
if (rc != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(rc))
qdisc_qstats_drop(sch);
@@ -576,50 +580,17 @@ finish_segs:
return NET_XMIT_SUCCESS;
}
-static unsigned int netem_drop(struct Qdisc *sch)
-{
- struct netem_sched_data *q = qdisc_priv(sch);
- unsigned int len;
-
- len = qdisc_queue_drop(sch);
-
- if (!len) {
- struct rb_node *p = rb_first(&q->t_root);
-
- if (p) {
- struct sk_buff *skb = netem_rb_to_skb(p);
-
- rb_erase(p, &q->t_root);
- sch->q.qlen--;
- skb->next = NULL;
- skb->prev = NULL;
- qdisc_qstats_backlog_dec(sch, skb);
- kfree_skb(skb);
- }
- }
- if (!len && q->qdisc && q->qdisc->ops->drop)
- len = q->qdisc->ops->drop(q->qdisc);
- if (len)
- qdisc_qstats_drop(sch);
-
- return len;
-}
-
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
struct rb_node *p;
- if (qdisc_is_throttled(sch))
- return NULL;
-
tfifo_dequeue:
skb = __skb_dequeue(&sch->q);
if (skb) {
qdisc_qstats_backlog_dec(sch, skb);
deliver:
- qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
return skb;
}
@@ -651,8 +622,11 @@ deliver:
if (q->qdisc) {
unsigned int pkt_len = qdisc_pkt_len(skb);
- int err = qdisc_enqueue(skb, q->qdisc);
+ struct sk_buff *to_free = NULL;
+ int err;
+ err = qdisc_enqueue(skb, q->qdisc, &to_free);
+ kfree_skb_list(to_free);
if (err != NET_XMIT_SUCCESS &&
net_xmit_drop_count(err)) {
qdisc_qstats_drop(sch);
@@ -1143,7 +1117,6 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
.enqueue = netem_enqueue,
.dequeue = netem_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = netem_drop,
.init = netem_init,
.reset = netem_reset,
.destroy = netem_destroy,
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 71ae3b962..a570b0bb2 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -134,7 +134,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
return false;
}
-static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct pie_sched_data *q = qdisc_priv(sch);
bool enqueue = false;
@@ -166,7 +167,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
out:
q->stats.dropped++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
@@ -234,7 +235,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
- qdisc_drop(skb, sch);
+ rtnl_qdisc_drop(skb, sch);
}
qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
index 5abfe4467..1c6cbab3e 100644
--- a/net/sched/sch_plug.c
+++ b/net/sched/sch_plug.c
@@ -64,6 +64,8 @@ struct plug_sched_data {
*/
bool unplug_indefinite;
+ bool throttled;
+
/* Queue Limit in bytes */
u32 limit;
@@ -86,7 +88,8 @@ struct plug_sched_data {
u32 pkts_to_release;
};
-static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct plug_sched_data *q = qdisc_priv(sch);
@@ -96,14 +99,14 @@ static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return qdisc_enqueue_tail(skb, sch);
}
- return qdisc_reshape_fail(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
static struct sk_buff *plug_dequeue(struct Qdisc *sch)
{
struct plug_sched_data *q = qdisc_priv(sch);
- if (qdisc_is_throttled(sch))
+ if (q->throttled)
return NULL;
if (!q->unplug_indefinite) {
@@ -111,7 +114,7 @@ static struct sk_buff *plug_dequeue(struct Qdisc *sch)
/* No more packets to dequeue. Block the queue
* and wait for the next release command.
*/
- qdisc_throttled(sch);
+ q->throttled = true;
return NULL;
}
q->pkts_to_release--;
@@ -141,7 +144,7 @@ static int plug_init(struct Qdisc *sch, struct nlattr *opt)
q->limit = ctl->limit;
}
- qdisc_throttled(sch);
+ q->throttled = true;
return 0;
}
@@ -173,7 +176,7 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt)
q->pkts_last_epoch = q->pkts_current_epoch;
q->pkts_current_epoch = 0;
if (q->unplug_indefinite)
- qdisc_throttled(sch);
+ q->throttled = true;
q->unplug_indefinite = false;
break;
case TCQ_PLUG_RELEASE_ONE:
@@ -182,7 +185,7 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt)
*/
q->pkts_to_release += q->pkts_last_epoch;
q->pkts_last_epoch = 0;
- qdisc_unthrottled(sch);
+ q->throttled = false;
netif_schedule_queue(sch->dev_queue);
break;
case TCQ_PLUG_RELEASE_INDEFINITE:
@@ -190,7 +193,7 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt)
q->pkts_to_release = 0;
q->pkts_last_epoch = 0;
q->pkts_current_epoch = 0;
- qdisc_unthrottled(sch);
+ q->throttled = false;
netif_schedule_queue(sch->dev_queue);
break;
case TCQ_PLUG_LIMIT:
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index a356450b7..8f575899a 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -67,7 +67,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
}
static int
-prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
struct Qdisc *qdisc;
int ret;
@@ -83,7 +83,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
#endif
- ret = qdisc_enqueue(skb, qdisc);
+ ret = qdisc_enqueue(skb, qdisc, to_free);
if (ret == NET_XMIT_SUCCESS) {
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
@@ -127,25 +127,6 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch)
}
-static unsigned int prio_drop(struct Qdisc *sch)
-{
- struct prio_sched_data *q = qdisc_priv(sch);
- int prio;
- unsigned int len;
- struct Qdisc *qdisc;
-
- for (prio = q->bands-1; prio >= 0; prio--) {
- qdisc = q->queues[prio];
- if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
- sch->qstats.backlog -= len;
- sch->q.qlen--;
- return len;
- }
- }
- return 0;
-}
-
-
static void
prio_reset(struct Qdisc *sch)
{
@@ -304,7 +285,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct Qdisc *cl_q;
cl_q = q->queues[cl - 1];
- if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl_q->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
return -1;
@@ -363,7 +345,6 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
.enqueue = prio_enqueue,
.dequeue = prio_dequeue,
.peek = prio_peek,
- .drop = prio_drop,
.init = prio_init,
.reset = prio_reset,
.destroy = prio_destroy,
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index f18857feb..ca0516e6f 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -460,7 +460,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
- qdisc_root_sleeping_lock(sch),
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err)
return err;
@@ -486,7 +487,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL,
&cl->rate_est,
- qdisc_root_sleeping_lock(sch),
+ NULL,
+ qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err)
goto destroy_class;
@@ -663,7 +665,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
xstats.weight = cl->agg->class_weight;
xstats.lmax = cl->agg->lmax;
- if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL,
&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
@@ -1150,6 +1153,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
if (!skb)
return NULL;
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
qdisc_bstats_update(sch, skb);
@@ -1214,7 +1218,8 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
return agg;
}
-static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl;
@@ -1237,11 +1242,11 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
qdisc_pkt_len(skb));
if (err) {
cl->qstats.drops++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
}
- err = qdisc_enqueue(skb, cl->qdisc);
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
pr_debug("qfq_enqueue: enqueue failed %d\n", err);
if (net_xmit_drop_count(err)) {
@@ -1252,6 +1257,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
bstats_update(&cl->bstats, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
++sch->q.qlen;
agg = cl->agg;
@@ -1422,52 +1428,6 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
qfq_deactivate_class(q, cl);
}
-static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
- struct hlist_head *slot)
-{
- struct qfq_aggregate *agg;
- struct qfq_class *cl;
- unsigned int len;
-
- hlist_for_each_entry(agg, slot, next) {
- list_for_each_entry(cl, &agg->active, alist) {
-
- if (!cl->qdisc->ops->drop)
- continue;
-
- len = cl->qdisc->ops->drop(cl->qdisc);
- if (len > 0) {
- if (cl->qdisc->q.qlen == 0)
- qfq_deactivate_class(q, cl);
-
- return len;
- }
- }
- }
- return 0;
-}
-
-static unsigned int qfq_drop(struct Qdisc *sch)
-{
- struct qfq_sched *q = qdisc_priv(sch);
- struct qfq_group *grp;
- unsigned int i, j, len;
-
- for (i = 0; i <= QFQ_MAX_INDEX; i++) {
- grp = &q->groups[i];
- for (j = 0; j < QFQ_MAX_SLOTS; j++) {
- len = qfq_drop_from_slot(q, &grp->slots[j]);
- if (len > 0) {
- sch->q.qlen--;
- return len;
- }
- }
-
- }
-
- return 0;
-}
-
static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
{
struct qfq_sched *q = qdisc_priv(sch);
@@ -1518,6 +1478,7 @@ static void qfq_reset_qdisc(struct Qdisc *sch)
qdisc_reset(cl->qdisc);
}
}
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
@@ -1562,7 +1523,6 @@ static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
.enqueue = qfq_enqueue,
.dequeue = qfq_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = qfq_drop,
.init = qfq_init_qdisc,
.reset = qfq_reset_qdisc,
.destroy = qfq_destroy_qdisc,
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 91578bdd3..249b2a18a 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -56,7 +56,8 @@ static inline int red_use_harddrop(struct red_sched_data *q)
return q->flags & TC_RED_HARDDROP;
}
-static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -95,7 +96,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
break;
}
- ret = qdisc_enqueue(skb, child);
+ ret = qdisc_enqueue(skb, child, to_free);
if (likely(ret == NET_XMIT_SUCCESS)) {
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
@@ -106,7 +107,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return ret;
congestion_drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_CN;
}
@@ -136,26 +137,6 @@ static struct sk_buff *red_peek(struct Qdisc *sch)
return child->ops->peek(child);
}
-static unsigned int red_drop(struct Qdisc *sch)
-{
- struct red_sched_data *q = qdisc_priv(sch);
- struct Qdisc *child = q->qdisc;
- unsigned int len;
-
- if (child->ops->drop && (len = child->ops->drop(child)) > 0) {
- q->stats.other++;
- qdisc_qstats_drop(sch);
- sch->qstats.backlog -= len;
- sch->q.qlen--;
- return len;
- }
-
- if (!red_is_idling(&q->vars))
- red_start_of_idle_period(&q->vars);
-
- return 0;
-}
-
static void red_reset(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
@@ -365,7 +346,6 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = {
.enqueue = red_enqueue,
.dequeue = red_dequeue,
.peek = red_peek,
- .drop = red_drop,
.init = red_init,
.reset = red_reset,
.destroy = red_destroy,
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index c69611640..20a350bd1 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -275,7 +275,8 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
return false;
}
-static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct sfb_sched_data *q = qdisc_priv(sch);
@@ -397,8 +398,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
enqueue:
- ret = qdisc_enqueue(skb, child);
+ ret = qdisc_enqueue(skb, child, to_free);
if (likely(ret == NET_XMIT_SUCCESS)) {
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
increment_qlen(skb, q);
} else if (net_xmit_drop_count(ret)) {
@@ -408,7 +410,7 @@ enqueue:
return ret;
drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_CN;
other_drop:
if (ret & __NET_XMIT_BYPASS)
@@ -427,6 +429,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
if (skb) {
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
decrement_qlen(skb, q);
}
@@ -449,6 +452,7 @@ static void sfb_reset(struct Qdisc *sch)
struct sfb_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
q->slot = 0;
q->double_buffering = false;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 498f0a2cb..7f195ed4d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -343,7 +343,7 @@ static int sfq_headdrop(const struct sfq_sched_data *q)
}
static int
-sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned int hash, dropped;
@@ -367,7 +367,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (x == SFQ_EMPTY_SLOT) {
x = q->dep[0].next; /* get a free slot */
if (x >= SFQ_MAX_FLOWS)
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
q->ht[hash] = x;
slot = &q->slots[x];
slot->hash = hash;
@@ -424,14 +424,14 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (slot->qlen >= q->maxdepth) {
congestion_drop:
if (!sfq_headdrop(q))
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
/* We know we have at least one packet in queue */
head = slot_dequeue_head(slot);
delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
sch->qstats.backlog -= delta;
slot->backlog -= delta;
- qdisc_drop(head, sch);
+ qdisc_drop(head, sch, to_free);
slot_queue_add(slot, skb);
return NET_XMIT_CN;
@@ -520,7 +520,7 @@ sfq_reset(struct Qdisc *sch)
struct sk_buff *skb;
while ((skb = sfq_dequeue(sch)) != NULL)
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
}
/*
@@ -896,7 +896,6 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
.enqueue = sfq_enqueue,
.dequeue = sfq_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = sfq_drop,
.init = sfq_init,
.reset = sfq_reset,
.destroy = sfq_destroy,
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 3161e4919..303355c44 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -155,7 +155,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
/* GSO packet is too big, segment it so that tbf can transmit
* each segment in time
*/
-static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *segs, *nskb;
@@ -166,7 +167,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs))
- return qdisc_reshape_fail(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
nb = 0;
while (segs) {
@@ -174,7 +175,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
segs->next = NULL;
qdisc_skb_cb(segs)->pkt_len = segs->len;
len += segs->len;
- ret = qdisc_enqueue(segs, q->qdisc);
+ ret = qdisc_enqueue(segs, q->qdisc, to_free);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
@@ -190,17 +191,18 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
-static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct tbf_sched_data *q = qdisc_priv(sch);
int ret;
if (qdisc_pkt_len(skb) > q->max_size) {
if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
- return tbf_segment(skb, sch);
- return qdisc_reshape_fail(skb, sch);
+ return tbf_segment(skb, sch, to_free);
+ return qdisc_drop(skb, sch, to_free);
}
- ret = qdisc_enqueue(skb, q->qdisc);
+ ret = qdisc_enqueue(skb, q->qdisc, to_free);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
@@ -212,19 +214,6 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return NET_XMIT_SUCCESS;
}
-static unsigned int tbf_drop(struct Qdisc *sch)
-{
- struct tbf_sched_data *q = qdisc_priv(sch);
- unsigned int len = 0;
-
- if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
- sch->qstats.backlog -= len;
- sch->q.qlen--;
- qdisc_qstats_drop(sch);
- }
- return len;
-}
-
static bool tbf_peak_present(const struct tbf_sched_data *q)
{
return q->peak.rate_bytes_ps;
@@ -267,14 +256,12 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
q->ptokens = ptoks;
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
- qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
return skb;
}
qdisc_watchdog_schedule_ns(&q->watchdog,
- now + max_t(long, -toks, -ptoks),
- true);
+ now + max_t(long, -toks, -ptoks));
/* Maybe we have a shorter packet in the queue,
which can be sent now. It sounds cool,
@@ -559,7 +546,6 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
.enqueue = tbf_enqueue,
.dequeue = tbf_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = tbf_drop,
.init = tbf_init,
.reset = tbf_reset,
.destroy = tbf_destroy,
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index e02687185..2cd9b4478 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -77,7 +77,7 @@ struct teql_sched_data {
/* "teql*" qdisc routines */
static int
-teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
struct net_device *dev = qdisc_dev(sch);
struct teql_sched_data *q = qdisc_priv(sch);
@@ -87,7 +87,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return NET_XMIT_SUCCESS;
}
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
static struct sk_buff *
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 0fca5824a..6c4f7496c 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -11,7 +11,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
transport.o chunk.o sm_make_chunk.o ulpevent.o \
inqueue.o outqueue.o ulpqueue.o \
tsnmap.o bind_addr.o socket.o primitive.o \
- output.o input.o debug.o ssnmap.o auth.o
+ output.o input.o debug.o ssnmap.o auth.o \
+ offload.o
sctp_probe-y := probe.o
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index e1849f371..1c23060c4 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -268,6 +268,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
goto fail_init;
asoc->active_key_id = ep->active_key_id;
+ asoc->prsctp_enable = ep->prsctp_enable;
/* Save the hmacs and chunks list into this association */
if (ep->auth_hmacs_list)
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 1eb94bf18..0a3dbec0a 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -179,6 +179,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
msg, msg->expires_at, jiffies);
}
+ if (asoc->peer.prsctp_capable &&
+ SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
+ msg->expires_at =
+ jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
+
/* This is the biggest possible DATA chunk that can fit into
* the packet
*/
@@ -335,13 +340,32 @@ errout:
/* Check whether this message has expired. */
int sctp_chunk_abandoned(struct sctp_chunk *chunk)
{
- struct sctp_datamsg *msg = chunk->msg;
+ if (!chunk->asoc->peer.prsctp_capable ||
+ !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) {
+ struct sctp_datamsg *msg = chunk->msg;
+
+ if (!msg->can_abandon)
+ return 0;
+
+ if (time_after(jiffies, msg->expires_at))
+ return 1;
- if (!msg->can_abandon)
return 0;
+ }
- if (time_after(jiffies, msg->expires_at))
+ if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
+ time_after(jiffies, chunk->msg->expires_at)) {
+ if (chunk->sent_count)
+ chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
+ else
+ chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
+ return 1;
+ } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) &&
+ chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
+ chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
return 1;
+ }
+ /* PRIO policy is processed by sendmsg, not here */
return 0;
}
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 9d494e35e..1f0306568 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -163,6 +163,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
*/
ep->auth_hmacs_list = auth_hmacs;
ep->auth_chunk_list = auth_chunks;
+ ep->prsctp_enable = net->sctp.prsctp_enable;
return ep;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index f09332256..1555fb8c6 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -90,17 +90,6 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb)
return 0;
}
-struct sctp_input_cb {
- union {
- struct inet_skb_parm h4;
-#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_skb_parm h6;
-#endif
- } header;
- struct sctp_chunk *chunk;
-};
-#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
-
/*
* This is the routine which IP calls when receiving an SCTP packet.
*/
@@ -123,31 +112,41 @@ int sctp_rcv(struct sk_buff *skb)
__SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS);
- if (skb_linearize(skb))
+ /* If packet is too small to contain a single chunk, let's not
+ * waste time on it anymore.
+ */
+ if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) +
+ skb_transport_offset(skb))
goto discard_it;
- /* Pull up the IP and SCTP headers. */
- __skb_pull(skb, skb_transport_offset(skb));
- if (skb->len < sizeof(struct sctphdr))
+ /* If the packet is fragmented and we need to do crc checking,
+ * it's better to just linearize it otherwise crc computing
+ * takes longer.
+ */
+ if ((!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) &&
+ skb_linearize(skb)) ||
+ !pskb_may_pull(skb, sizeof(struct sctphdr)))
goto discard_it;
+ /* Pull up the IP header. */
+ __skb_pull(skb, skb_transport_offset(skb));
+
skb->csum_valid = 0; /* Previous value not applicable */
if (skb_csum_unnecessary(skb))
__skb_decr_checksum_unnecessary(skb);
- else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0)
+ else if (!sctp_checksum_disable &&
+ !(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) &&
+ sctp_rcv_checksum(net, skb) < 0)
goto discard_it;
skb->csum_valid = 1;
- skb_pull(skb, sizeof(struct sctphdr));
-
- /* Make sure we at least have chunk headers worth of data left. */
- if (skb->len < sizeof(struct sctp_chunkhdr))
- goto discard_it;
+ __skb_pull(skb, sizeof(struct sctphdr));
family = ipver2af(ip_hdr(skb)->version);
af = sctp_get_af_specific(family);
if (unlikely(!af))
goto discard_it;
+ SCTP_INPUT_CB(skb)->af = af;
/* Initialize local addresses for lookups. */
af->from_skb(&src, skb, 1);
@@ -659,19 +658,23 @@ out_unlock:
*/
static int sctp_rcv_ootb(struct sk_buff *skb)
{
- sctp_chunkhdr_t *ch;
- __u8 *ch_end;
-
- ch = (sctp_chunkhdr_t *) skb->data;
+ sctp_chunkhdr_t *ch, _ch;
+ int ch_end, offset = 0;
/* Scan through all the chunks in the packet. */
do {
+ /* Make sure we have at least the header there */
+ if (offset + sizeof(sctp_chunkhdr_t) > skb->len)
+ break;
+
+ ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch);
+
/* Break out if chunk length is less then minimal. */
if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
break;
- ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
- if (ch_end > skb_tail_pointer(skb))
+ ch_end = offset + WORD_ROUND(ntohs(ch->length));
+ if (ch_end > skb->len)
break;
/* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the
@@ -696,8 +699,8 @@ static int sctp_rcv_ootb(struct sk_buff *skb)
if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data)
goto discard;
- ch = (sctp_chunkhdr_t *) ch_end;
- } while (ch_end < skb_tail_pointer(skb));
+ offset = ch_end;
+ } while (ch_end < skb->len);
return 0;
@@ -793,27 +796,34 @@ struct sctp_hash_cmp_arg {
static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
+ struct sctp_transport *t = (struct sctp_transport *)ptr;
const struct sctp_hash_cmp_arg *x = arg->key;
- const struct sctp_transport *t = ptr;
- struct sctp_association *asoc = t->asoc;
- const struct net *net = x->net;
+ struct sctp_association *asoc;
+ int err = 1;
if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
- return 1;
- if (!net_eq(sock_net(asoc->base.sk), net))
- return 1;
+ return err;
+ if (!sctp_transport_hold(t))
+ return err;
+
+ asoc = t->asoc;
+ if (!net_eq(sock_net(asoc->base.sk), x->net))
+ goto out;
if (x->ep) {
if (x->ep != asoc->ep)
- return 1;
+ goto out;
} else {
if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
- return 1;
+ goto out;
if (!sctp_bind_addr_match(&asoc->base.bind_addr,
x->laddr, sctp_sk(asoc->base.sk)))
- return 1;
+ goto out;
}
- return 0;
+ err = 0;
+out:
+ sctp_transport_put(t);
+ return err;
}
static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
@@ -1172,6 +1182,14 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
{
sctp_chunkhdr_t *ch;
+ /* We do not allow GSO frames here as we need to linearize and
+ * then cannot guarantee frame boundaries. This shouldn't be an
+ * issue as packets hitting this are mostly INIT or INIT-ACK and
+ * those cannot be on GSO-style anyway.
+ */
+ if ((skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP)
+ return NULL;
+
ch = (sctp_chunkhdr_t *) skb->data;
/* The code below will attempt to walk the chunk and extract
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index b335ffcef..6437aa97c 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -128,13 +128,25 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
* at this time.
*/
- if ((chunk = queue->in_progress)) {
+ chunk = queue->in_progress;
+ if (chunk) {
/* There is a packet that we have been working on.
* Any post processing work to do before we move on?
*/
if (chunk->singleton ||
chunk->end_of_packet ||
chunk->pdiscard) {
+ if (chunk->head_skb == chunk->skb) {
+ chunk->skb = skb_shinfo(chunk->skb)->frag_list;
+ goto new_skb;
+ }
+ if (chunk->skb->next) {
+ chunk->skb = chunk->skb->next;
+ goto new_skb;
+ }
+
+ if (chunk->head_skb)
+ chunk->skb = chunk->head_skb;
sctp_chunk_free(chunk);
chunk = queue->in_progress = NULL;
} else {
@@ -150,34 +162,58 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
if (!chunk) {
struct list_head *entry;
+next_chunk:
/* Is the queue empty? */
- if (list_empty(&queue->in_chunk_list))
+ entry = sctp_list_dequeue(&queue->in_chunk_list);
+ if (!entry)
return NULL;
- entry = queue->in_chunk_list.next;
- chunk = queue->in_progress =
- list_entry(entry, struct sctp_chunk, list);
- list_del_init(entry);
+ chunk = list_entry(entry, struct sctp_chunk, list);
- /* This is the first chunk in the packet. */
- chunk->singleton = 1;
- ch = (sctp_chunkhdr_t *) chunk->skb->data;
- chunk->data_accepted = 0;
+ if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) {
+ /* GSO-marked skbs but without frags, handle
+ * them normally
+ */
+ if (skb_shinfo(chunk->skb)->frag_list)
+ chunk->head_skb = chunk->skb;
+
+ /* skbs with "cover letter" */
+ if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len)
+ chunk->skb = skb_shinfo(chunk->skb)->frag_list;
+
+ if (WARN_ON(!chunk->skb)) {
+ __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS);
+ sctp_chunk_free(chunk);
+ goto next_chunk;
+ }
+ }
if (chunk->asoc)
sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb);
+
+ queue->in_progress = chunk;
+
+new_skb:
+ /* This is the first chunk in the packet. */
+ ch = (sctp_chunkhdr_t *) chunk->skb->data;
+ chunk->singleton = 1;
+ chunk->data_accepted = 0;
+ chunk->pdiscard = 0;
+ chunk->auth = 0;
+ chunk->has_asconf = 0;
+ chunk->end_of_packet = 0;
+ if (chunk->head_skb) {
+ struct sctp_input_cb
+ *cb = SCTP_INPUT_CB(chunk->skb),
+ *head_cb = SCTP_INPUT_CB(chunk->head_skb);
+
+ cb->chunk = head_cb->chunk;
+ cb->af = head_cb->af;
+ }
}
chunk->chunk_hdr = ch;
chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
- /* In the unlikely case of an IP reassembly, the skb could be
- * non-linear. If so, update chunk_end so that it doesn't go past
- * the skb->tail.
- */
- if (unlikely(skb_is_nonlinear(chunk->skb))) {
- if (chunk->chunk_end > skb_tail_pointer(chunk->skb))
- chunk->chunk_end = skb_tail_pointer(chunk->skb);
- }
skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t));
chunk->subh.v = NULL; /* Subheader is no longer valid. */
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0657d18a8..f473779e8 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -420,6 +420,7 @@ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
addr->v6.sin6_flowinfo = 0; /* FIXME */
addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
+ /* Always called on head skb, so this is safe */
sh = sctp_hdr(skb);
if (is_saddr) {
*port = sh->source;
@@ -559,6 +560,7 @@ static int sctp_v6_is_any(const union sctp_addr *addr)
static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
{
int type;
+ struct net *net = sock_net(&sp->inet.sk);
const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr;
type = ipv6_addr_type(in6);
@@ -573,7 +575,8 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
if (!(type & IPV6_ADDR_UNICAST))
return 0;
- return ipv6_chk_addr(sock_net(&sp->inet.sk), in6, NULL, 0);
+ return sp->inet.freebind || net->ipv6.sysctl.ip_nonlocal_bind ||
+ ipv6_chk_addr(net, in6, NULL, 0);
}
/* This function checks if the address is a valid address to be used for
@@ -710,8 +713,7 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr)
/* Where did this skb come from? */
static int sctp_v6_skb_iif(const struct sk_buff *skb)
{
- struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb;
- return opt->iif;
+ return IP6CB(skb)->iif;
}
/* Was this packet marked by Explicit Congestion Notification? */
@@ -780,15 +782,14 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname,
if (ip_hdr(skb)->version == 4) {
addr->v4.sin_family = AF_INET;
addr->v4.sin_port = sh->source;
- addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr;
+ addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr;
} else {
addr->v6.sin6_family = AF_INET6;
addr->v6.sin6_flowinfo = 0;
addr->v6.sin6_port = sh->source;
addr->v6.sin6_addr = ipv6_hdr(skb)->saddr;
if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
- struct sctp_ulpevent *ev = sctp_skb2event(skb);
- addr->v6.sin6_scope_id = ev->iif;
+ addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb);
}
}
@@ -955,7 +956,7 @@ static const struct proto_ops inet6_seqpacket_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
new file mode 100644
index 000000000..7e869d0cc
--- /dev/null
+++ b/net/sctp/offload.c
@@ -0,0 +1,119 @@
+/*
+ * sctp_offload - GRO/GSO Offloading for SCTP
+ *
+ * Copyright (C) 2015, Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/sctp.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/time.h>
+#include <net/net_namespace.h>
+
+#include <linux/skbuff.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/checksum.h>
+#include <net/protocol.h>
+
+static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
+{
+ skb->ip_summed = CHECKSUM_NONE;
+ return sctp_compute_cksum(skb, skb_transport_offset(skb));
+}
+
+static struct sk_buff *sctp_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct sctphdr *sh;
+
+ sh = sctp_hdr(skb);
+ if (!pskb_may_pull(skb, sizeof(*sh)))
+ goto out;
+
+ __skb_pull(skb, sizeof(*sh));
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+ struct sk_buff *frag_iter;
+
+ pinfo->gso_segs = 0;
+ if (skb->len != skb->data_len) {
+ /* Means we have chunks in here too */
+ pinfo->gso_segs++;
+ }
+
+ skb_walk_frags(skb, frag_iter)
+ pinfo->gso_segs++;
+
+ segs = NULL;
+ goto out;
+ }
+
+ segs = skb_segment(skb, features | NETIF_F_HW_CSUM);
+ if (IS_ERR(segs))
+ goto out;
+
+ /* All that is left is update SCTP CRC if necessary */
+ if (!(features & NETIF_F_SCTP_CRC)) {
+ for (skb = segs; skb; skb = skb->next) {
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ sh = sctp_hdr(skb);
+ sh->checksum = sctp_gso_make_checksum(skb);
+ }
+ }
+ }
+
+out:
+ return segs;
+}
+
+static const struct net_offload sctp_offload = {
+ .callbacks = {
+ .gso_segment = sctp_gso_segment,
+ },
+};
+
+static const struct net_offload sctp6_offload = {
+ .callbacks = {
+ .gso_segment = sctp_gso_segment,
+ },
+};
+
+int __init sctp_offload_init(void)
+{
+ int ret;
+
+ ret = inet_add_offload(&sctp_offload, IPPROTO_SCTP);
+ if (ret)
+ goto out;
+
+ ret = inet6_add_offload(&sctp6_offload, IPPROTO_SCTP);
+ if (ret)
+ goto ipv4;
+
+ return ret;
+
+ipv4:
+ inet_del_offload(&sctp_offload, IPPROTO_SCTP);
+out:
+ return ret;
+}
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 9844fe573..31b7bc358 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -84,18 +84,42 @@ static void sctp_packet_reset(struct sctp_packet *packet)
struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
__u32 vtag, int ecn_capable)
{
- struct sctp_chunk *chunk = NULL;
+ struct sctp_transport *tp = packet->transport;
+ struct sctp_association *asoc = tp->asoc;
pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
packet->vtag = vtag;
+ if (asoc && tp->dst) {
+ struct sock *sk = asoc->base.sk;
+
+ rcu_read_lock();
+ if (__sk_dst_get(sk) != tp->dst) {
+ dst_hold(tp->dst);
+ sk_setup_caps(sk, tp->dst);
+ }
+
+ if (sk_can_gso(sk)) {
+ struct net_device *dev = tp->dst->dev;
+
+ packet->max_size = dev->gso_max_size;
+ } else {
+ packet->max_size = asoc->pathmtu;
+ }
+ rcu_read_unlock();
+
+ } else {
+ packet->max_size = tp->pathmtu;
+ }
+
if (ecn_capable && sctp_packet_empty(packet)) {
- chunk = sctp_get_ecne_prepend(packet->transport->asoc);
+ struct sctp_chunk *chunk;
/* If there a is a prepend chunk stick it on the list before
* any other chunks get appended.
*/
+ chunk = sctp_get_ecne_prepend(asoc);
if (chunk)
sctp_packet_append_chunk(packet, chunk);
}
@@ -158,7 +182,8 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
sctp_xmit_t retval;
int error = 0;
- pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk);
+ pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__,
+ packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);
switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
case SCTP_XMIT_PMTU_FULL:
@@ -291,6 +316,8 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet,
packet->has_data = 1;
/* timestamp the chunk for rtx purposes */
chunk->sent_at = jiffies;
+ /* Mainly used for prsctp RTX policy */
+ chunk->sent_count++;
break;
case SCTP_CID_COOKIE_ECHO:
packet->has_cookie_echo = 1;
@@ -381,12 +408,15 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
struct sctp_transport *tp = packet->transport;
struct sctp_association *asoc = tp->asoc;
struct sctphdr *sh;
- struct sk_buff *nskb;
+ struct sk_buff *nskb = NULL, *head = NULL;
struct sctp_chunk *chunk, *tmp;
struct sock *sk;
int err = 0;
int padding; /* How much padding do we need? */
+ int pkt_size;
__u8 has_data = 0;
+ int gso = 0;
+ int pktcount = 0;
struct dst_entry *dst;
unsigned char *auth = NULL; /* pointer to auth in skb data */
@@ -400,18 +430,37 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
sk = chunk->skb->sk;
- /* Allocate the new skb. */
- nskb = alloc_skb(packet->size + MAX_HEADER, gfp);
- if (!nskb)
+ /* Allocate the head skb, or main one if not in GSO */
+ if (packet->size > tp->pathmtu && !packet->ipfragok) {
+ if (sk_can_gso(sk)) {
+ gso = 1;
+ pkt_size = packet->overhead;
+ } else {
+ /* If this happens, we trash this packet and try
+ * to build a new one, hopefully correct this
+ * time. Application may notice this error.
+ */
+ pr_err_once("Trying to GSO but underlying device doesn't support it.");
+ goto nomem;
+ }
+ } else {
+ pkt_size = packet->size;
+ }
+ head = alloc_skb(pkt_size + MAX_HEADER, gfp);
+ if (!head)
goto nomem;
+ if (gso) {
+ NAPI_GRO_CB(head)->last = head;
+ skb_shinfo(head)->gso_type = sk->sk_gso_type;
+ }
/* Make sure the outbound skb has enough header room reserved. */
- skb_reserve(nskb, packet->overhead + MAX_HEADER);
+ skb_reserve(head, packet->overhead + MAX_HEADER);
/* Set the owning socket so that we know where to get the
* destination IP address.
*/
- sctp_packet_set_owner_w(nskb, sk);
+ sctp_packet_set_owner_w(head, sk);
if (!sctp_transport_dst_check(tp)) {
sctp_transport_route(tp, NULL, sctp_sk(sk));
@@ -422,11 +471,11 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
dst = dst_clone(tp->dst);
if (!dst)
goto no_route;
- skb_dst_set(nskb, dst);
+ skb_dst_set(head, dst);
/* Build the SCTP header. */
- sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr));
- skb_reset_transport_header(nskb);
+ sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr));
+ skb_reset_transport_header(head);
sh->source = htons(packet->source_port);
sh->dest = htons(packet->destination_port);
@@ -441,90 +490,144 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
sh->vtag = htonl(packet->vtag);
sh->checksum = 0;
- /**
- * 6.10 Bundling
- *
- * An endpoint bundles chunks by simply including multiple
- * chunks in one outbound SCTP packet. ...
- */
-
- /**
- * 3.2 Chunk Field Descriptions
- *
- * The total length of a chunk (including Type, Length and
- * Value fields) MUST be a multiple of 4 bytes. If the length
- * of the chunk is not a multiple of 4 bytes, the sender MUST
- * pad the chunk with all zero bytes and this padding is not
- * included in the chunk length field. The sender should
- * never pad with more than 3 bytes.
- *
- * [This whole comment explains WORD_ROUND() below.]
- */
-
pr_debug("***sctp_transmit_packet***\n");
- list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
- list_del_init(&chunk->list);
- if (sctp_chunk_is_data(chunk)) {
- /* 6.3.1 C4) When data is in flight and when allowed
- * by rule C5, a new RTT measurement MUST be made each
- * round trip. Furthermore, new RTT measurements
- * SHOULD be made no more than once per round-trip
- * for a given destination transport address.
- */
+ do {
+ /* Set up convenience variables... */
+ chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
+ pktcount++;
- if (!chunk->resent && !tp->rto_pending) {
- chunk->rtt_in_progress = 1;
- tp->rto_pending = 1;
+ /* Calculate packet size, so it fits in PMTU. Leave
+ * other chunks for the next packets.
+ */
+ if (gso) {
+ pkt_size = packet->overhead;
+ list_for_each_entry(chunk, &packet->chunk_list, list) {
+ int padded = WORD_ROUND(chunk->skb->len);
+
+ if (pkt_size + padded > tp->pathmtu)
+ break;
+ pkt_size += padded;
}
- has_data = 1;
- }
+ /* Allocate a new skb. */
+ nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
+ if (!nskb)
+ goto nomem;
- padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
- if (padding)
- memset(skb_put(chunk->skb, padding), 0, padding);
+ /* Make sure the outbound skb has enough header
+ * room reserved.
+ */
+ skb_reserve(nskb, packet->overhead + MAX_HEADER);
+ } else {
+ nskb = head;
+ }
- /* if this is the auth chunk that we are adding,
- * store pointer where it will be added and put
- * the auth into the packet.
+ /**
+ * 3.2 Chunk Field Descriptions
+ *
+ * The total length of a chunk (including Type, Length and
+ * Value fields) MUST be a multiple of 4 bytes. If the length
+ * of the chunk is not a multiple of 4 bytes, the sender MUST
+ * pad the chunk with all zero bytes and this padding is not
+ * included in the chunk length field. The sender should
+ * never pad with more than 3 bytes.
+ *
+ * [This whole comment explains WORD_ROUND() below.]
*/
- if (chunk == packet->auth)
- auth = skb_tail_pointer(nskb);
- memcpy(skb_put(nskb, chunk->skb->len),
+ pkt_size -= packet->overhead;
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
+ if (sctp_chunk_is_data(chunk)) {
+ /* 6.3.1 C4) When data is in flight and when allowed
+ * by rule C5, a new RTT measurement MUST be made each
+ * round trip. Furthermore, new RTT measurements
+ * SHOULD be made no more than once per round-trip
+ * for a given destination transport address.
+ */
+
+ if (!chunk->resent && !tp->rto_pending) {
+ chunk->rtt_in_progress = 1;
+ tp->rto_pending = 1;
+ }
+
+ has_data = 1;
+ }
+
+ padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
+ if (padding)
+ memset(skb_put(chunk->skb, padding), 0, padding);
+
+ /* if this is the auth chunk that we are adding,
+ * store pointer where it will be added and put
+ * the auth into the packet.
+ */
+ if (chunk == packet->auth)
+ auth = skb_tail_pointer(nskb);
+
+ memcpy(skb_put(nskb, chunk->skb->len),
chunk->skb->data, chunk->skb->len);
- pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, "
- "rtt_in_progress:%d\n", chunk,
- sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
- chunk->has_tsn ? "TSN" : "No TSN",
- chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0,
- ntohs(chunk->chunk_hdr->length), chunk->skb->len,
- chunk->rtt_in_progress);
-
- /*
- * If this is a control chunk, this is our last
- * reference. Free data chunks after they've been
- * acknowledged or have failed.
+ pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n",
+ chunk,
+ sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
+ chunk->has_tsn ? "TSN" : "No TSN",
+ chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0,
+ ntohs(chunk->chunk_hdr->length), chunk->skb->len,
+ chunk->rtt_in_progress);
+
+ /* If this is a control chunk, this is our last
+ * reference. Free data chunks after they've been
+ * acknowledged or have failed.
+ * Re-queue auth chunks if needed.
+ */
+ pkt_size -= WORD_ROUND(chunk->skb->len);
+
+ if (!sctp_chunk_is_data(chunk) && chunk != packet->auth)
+ sctp_chunk_free(chunk);
+
+ if (!pkt_size)
+ break;
+ }
+
+ /* SCTP-AUTH, Section 6.2
+ * The sender MUST calculate the MAC as described in RFC2104 [2]
+ * using the hash function H as described by the MAC Identifier and
+ * the shared association key K based on the endpoint pair shared key
+ * described by the shared key identifier. The 'data' used for the
+ * computation of the AUTH-chunk is given by the AUTH chunk with its
+ * HMAC field set to zero (as shown in Figure 6) followed by all
+ * chunks that are placed after the AUTH chunk in the SCTP packet.
*/
- if (!sctp_chunk_is_data(chunk))
- sctp_chunk_free(chunk);
- }
+ if (auth)
+ sctp_auth_calculate_hmac(asoc, nskb,
+ (struct sctp_auth_chunk *)auth,
+ gfp);
+
+ if (packet->auth) {
+ if (!list_empty(&packet->chunk_list)) {
+ /* We will generate more packets, so re-queue
+ * auth chunk.
+ */
+ list_add(&packet->auth->list,
+ &packet->chunk_list);
+ } else {
+ sctp_chunk_free(packet->auth);
+ packet->auth = NULL;
+ }
+ }
- /* SCTP-AUTH, Section 6.2
- * The sender MUST calculate the MAC as described in RFC2104 [2]
- * using the hash function H as described by the MAC Identifier and
- * the shared association key K based on the endpoint pair shared key
- * described by the shared key identifier. The 'data' used for the
- * computation of the AUTH-chunk is given by the AUTH chunk with its
- * HMAC field set to zero (as shown in Figure 6) followed by all
- * chunks that are placed after the AUTH chunk in the SCTP packet.
- */
- if (auth)
- sctp_auth_calculate_hmac(asoc, nskb,
- (struct sctp_auth_chunk *)auth,
- gfp);
+ if (!gso)
+ break;
+
+ if (skb_gro_receive(&head, nskb))
+ goto nomem;
+ nskb = NULL;
+ if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
+ sk->sk_gso_max_segs))
+ goto nomem;
+ } while (!list_empty(&packet->chunk_list));
/* 2) Calculate the Adler-32 checksum of the whole packet,
* including the SCTP common header and all the
@@ -532,16 +635,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
*
* Note: Adler-32 is no longer applicable, as has been replaced
* by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
+ *
+ * If it's a GSO packet, it's postponed to sctp_skb_segment.
*/
- if (!sctp_checksum_disable) {
- if (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
- (dst_xfrm(dst) != NULL) || packet->ipfragok) {
- sh->checksum = sctp_compute_cksum(nskb, 0);
+ if (!sctp_checksum_disable || gso) {
+ if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
+ dst_xfrm(dst) || packet->ipfragok)) {
+ sh->checksum = sctp_compute_cksum(head, 0);
} else {
/* no need to seed pseudo checksum for SCTP */
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = skb_transport_header(nskb) - nskb->head;
- nskb->csum_offset = offsetof(struct sctphdr, checksum);
+ head->ip_summed = CHECKSUM_PARTIAL;
+ head->csum_start = skb_transport_header(head) - head->head;
+ head->csum_offset = offsetof(struct sctphdr, checksum);
}
}
@@ -557,7 +662,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
* Note: The works for IPv6 layer checks this bit too later
* in transmission. See IP6_ECN_flow_xmit().
*/
- tp->af_specific->ecn_capable(nskb->sk);
+ tp->af_specific->ecn_capable(sk);
/* Set up the IP options. */
/* BUG: not implemented
@@ -566,7 +671,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
/* Dump that on IP! */
if (asoc) {
- asoc->stats.opackets++;
+ asoc->stats.opackets += pktcount;
if (asoc->peer.last_sent_to != tp)
/* Considering the multiple CPU scenario, this is a
* "correcter" place for last_sent_to. --xguo
@@ -589,16 +694,36 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
}
}
- pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len);
+ pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len);
- nskb->ignore_df = packet->ipfragok;
- tp->af_specific->sctp_xmit(nskb, tp);
+ if (gso) {
+ /* Cleanup our debris for IP stacks */
+ memset(head->cb, 0, max(sizeof(struct inet_skb_parm),
+ sizeof(struct inet6_skb_parm)));
+
+ skb_shinfo(head)->gso_segs = pktcount;
+ skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
+
+ /* We have to refresh this in case we are xmiting to
+ * more than one transport at a time
+ */
+ rcu_read_lock();
+ if (__sk_dst_get(sk) != tp->dst) {
+ dst_hold(tp->dst);
+ sk_setup_caps(sk, tp->dst);
+ }
+ rcu_read_unlock();
+ }
+ head->ignore_df = packet->ipfragok;
+ tp->af_specific->sctp_xmit(head, tp);
out:
sctp_packet_reset(packet);
return err;
no_route:
- kfree_skb(nskb);
+ kfree_skb(head);
+ if (nskb != head)
+ kfree_skb(nskb);
if (asoc)
IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES);
@@ -623,6 +748,8 @@ err:
}
goto out;
nomem:
+ if (packet->auth && list_empty(&packet->auth->list))
+ sctp_chunk_free(packet->auth);
err = -ENOMEM;
goto err;
}
@@ -751,39 +878,74 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
struct sctp_chunk *chunk,
u16 chunk_len)
{
- size_t psize;
- size_t pmtu;
- int too_big;
+ size_t psize, pmtu, maxsize;
sctp_xmit_t retval = SCTP_XMIT_OK;
psize = packet->size;
- pmtu = ((packet->transport->asoc) ?
- (packet->transport->asoc->pathmtu) :
- (packet->transport->pathmtu));
-
- too_big = (psize + chunk_len > pmtu);
+ if (packet->transport->asoc)
+ pmtu = packet->transport->asoc->pathmtu;
+ else
+ pmtu = packet->transport->pathmtu;
/* Decide if we need to fragment or resubmit later. */
- if (too_big) {
- /* It's OK to fragmet at IP level if any one of the following
+ if (psize + chunk_len > pmtu) {
+ /* It's OK to fragment at IP level if any one of the following
* is true:
- * 1. The packet is empty (meaning this chunk is greater
- * the MTU)
- * 2. The chunk we are adding is a control chunk
- * 3. The packet doesn't have any data in it yet and data
- * requires authentication.
+ * 1. The packet is empty (meaning this chunk is greater
+ * the MTU)
+ * 2. The packet doesn't have any data in it yet and data
+ * requires authentication.
*/
- if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk) ||
+ if (sctp_packet_empty(packet) ||
(!packet->has_data && chunk->auth)) {
/* We no longer do re-fragmentation.
* Just fragment at the IP layer, if we
* actually hit this condition
*/
packet->ipfragok = 1;
- } else {
- retval = SCTP_XMIT_PMTU_FULL;
+ goto out;
}
+
+ /* Similarly, if this chunk was built before a PMTU
+ * reduction, we have to fragment it at IP level now. So
+ * if the packet already contains something, we need to
+ * flush.
+ */
+ maxsize = pmtu - packet->overhead;
+ if (packet->auth)
+ maxsize -= WORD_ROUND(packet->auth->skb->len);
+ if (chunk_len > maxsize)
+ retval = SCTP_XMIT_PMTU_FULL;
+
+ /* It is also okay to fragment if the chunk we are
+ * adding is a control chunk, but only if current packet
+ * is not a GSO one otherwise it causes fragmentation of
+ * a large frame. So in this case we allow the
+ * fragmentation by forcing it to be in a new packet.
+ */
+ if (!sctp_chunk_is_data(chunk) && packet->has_data)
+ retval = SCTP_XMIT_PMTU_FULL;
+
+ if (psize + chunk_len > packet->max_size)
+ /* Hit GSO/PMTU limit, gotta flush */
+ retval = SCTP_XMIT_PMTU_FULL;
+
+ if (!packet->transport->burst_limited &&
+ psize + chunk_len > (packet->transport->cwnd >> 1))
+ /* Do not allow a single GSO packet to use more
+ * than half of cwnd.
+ */
+ retval = SCTP_XMIT_PMTU_FULL;
+
+ if (packet->transport->burst_limited &&
+ psize + chunk_len > (packet->transport->burst_limited >> 1))
+ /* Do not allow a single GSO packet to use more
+ * than half of original cwnd.
+ */
+ retval = SCTP_XMIT_PMTU_FULL;
+ /* Otherwise it will fit in the GSO packet */
}
+out:
return retval;
}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 084718f9b..107233da5 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -326,6 +326,9 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
sctp_chunk_hold(chunk);
sctp_outq_tail_data(q, chunk);
+ if (chunk->asoc->peer.prsctp_capable &&
+ SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
+ chunk->asoc->sent_cnt_removable++;
if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
else
@@ -372,6 +375,96 @@ static void sctp_insert_list(struct list_head *head, struct list_head *new)
list_add_tail(new, head);
}
+static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
+ struct sctp_sndrcvinfo *sinfo,
+ struct list_head *queue, int msg_len)
+{
+ struct sctp_chunk *chk, *temp;
+
+ list_for_each_entry_safe(chk, temp, queue, transmitted_list) {
+ if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
+ chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
+ continue;
+
+ list_del_init(&chk->transmitted_list);
+ sctp_insert_list(&asoc->outqueue.abandoned,
+ &chk->transmitted_list);
+
+ asoc->sent_cnt_removable--;
+ asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
+
+ if (!chk->tsn_gap_acked) {
+ if (chk->transport)
+ chk->transport->flight_size -=
+ sctp_data_size(chk);
+ asoc->outqueue.outstanding_bytes -= sctp_data_size(chk);
+ }
+
+ msg_len -= SCTP_DATA_SNDSIZE(chk) +
+ sizeof(struct sk_buff) +
+ sizeof(struct sctp_chunk);
+ if (msg_len <= 0)
+ break;
+ }
+
+ return msg_len;
+}
+
+static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
+ struct sctp_sndrcvinfo *sinfo,
+ struct list_head *queue, int msg_len)
+{
+ struct sctp_chunk *chk, *temp;
+
+ list_for_each_entry_safe(chk, temp, queue, list) {
+ if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
+ chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
+ continue;
+
+ list_del_init(&chk->list);
+ asoc->sent_cnt_removable--;
+ asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
+
+ msg_len -= SCTP_DATA_SNDSIZE(chk) +
+ sizeof(struct sk_buff) +
+ sizeof(struct sctp_chunk);
+ sctp_chunk_free(chk);
+ if (msg_len <= 0)
+ break;
+ }
+
+ return msg_len;
+}
+
+/* Abandon the chunks according their priorities */
+void sctp_prsctp_prune(struct sctp_association *asoc,
+ struct sctp_sndrcvinfo *sinfo, int msg_len)
+{
+ struct sctp_transport *transport;
+
+ if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable)
+ return;
+
+ msg_len = sctp_prsctp_prune_sent(asoc, sinfo,
+ &asoc->outqueue.retransmit,
+ msg_len);
+ if (msg_len <= 0)
+ return;
+
+ list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+ transports) {
+ msg_len = sctp_prsctp_prune_sent(asoc, sinfo,
+ &transport->transmitted,
+ msg_len);
+ if (msg_len <= 0)
+ return;
+ }
+
+ sctp_prsctp_prune_unsent(asoc, sinfo,
+ &asoc->outqueue.out_chunk_list,
+ msg_len);
+}
+
/* Mark all the eligible packets on a transport for retransmission. */
void sctp_retransmit_mark(struct sctp_outq *q,
struct sctp_transport *transport,
@@ -962,6 +1055,9 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
/* Mark as failed send. */
sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
+ if (asoc->peer.prsctp_capable &&
+ SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
+ asoc->sent_cnt_removable--;
sctp_chunk_free(chunk);
continue;
}
@@ -1251,6 +1347,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
tsn = ntohl(tchunk->subh.data_hdr->tsn);
if (TSN_lte(tsn, ctsn)) {
list_del_init(&tchunk->transmitted_list);
+ if (asoc->peer.prsctp_capable &&
+ SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
+ asoc->sent_cnt_removable--;
sctp_chunk_free(tchunk);
}
}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d3d50daa2..7b523e3f5 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -240,6 +240,7 @@ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
port = &addr->v4.sin_port;
addr->v4.sin_family = AF_INET;
+ /* Always called on head skb, so this is safe */
sh = sctp_hdr(skb);
if (is_saddr) {
*port = sh->source;
@@ -1027,7 +1028,7 @@ static const struct proto_ops inet_seqpacket_ops = {
.setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
#ifdef CONFIG_COMPAT
@@ -1479,7 +1480,8 @@ static __init int sctp_init(void)
INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
}
- if (sctp_transport_hashtable_init())
+ status = sctp_transport_hashtable_init();
+ if (status)
goto err_thash_alloc;
pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize,
@@ -1516,6 +1518,9 @@ static __init int sctp_init(void)
if (status)
goto err_v6_add_protocol;
+ if (sctp_offload_init() < 0)
+ pr_crit("%s: Cannot add SCTP protocol offload\n", __func__);
+
out:
return status;
err_v6_add_protocol:
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index 10bae2201..cef0cee18 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -13,6 +13,7 @@ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r,
{
union sctp_addr laddr, paddr;
struct dst_entry *dst;
+ struct timer_list *t3_rtx = &asoc->peer.primary_path->T3_rtx_timer;
laddr = list_entry(asoc->base.bind_addr.address_list.next,
struct sctp_sockaddr_entry, list)->a;
@@ -40,10 +41,15 @@ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r,
}
r->idiag_state = asoc->state;
- r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX;
- r->idiag_retrans = asoc->rtx_data_chunks;
- r->idiag_expires = jiffies_to_msecs(
- asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] - jiffies);
+ if (timer_pending(t3_rtx)) {
+ r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX;
+ r->idiag_retrans = asoc->rtx_data_chunks;
+ r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies);
+ } else {
+ r->idiag_timer = 0;
+ r->idiag_retrans = 0;
+ r->idiag_expires = 0;
+ }
}
static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb,
@@ -266,28 +272,17 @@ out:
return err;
}
-static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
+static int sctp_sock_dump(struct sock *sk, void *p)
{
- struct sctp_endpoint *ep = tsp->asoc->ep;
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
struct sctp_comm_param *commp = p;
- struct sock *sk = ep->base.sk;
struct sk_buff *skb = commp->skb;
struct netlink_callback *cb = commp->cb;
const struct inet_diag_req_v2 *r = commp->r;
- struct sctp_association *assoc =
- list_entry(ep->asocs.next, struct sctp_association, asocs);
+ struct sctp_association *assoc;
int err = 0;
- /* find the ep only once through the transports by this condition */
- if (tsp->asoc != assoc)
- goto out;
-
- if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
- goto out;
-
lock_sock(sk);
- if (sk != assoc->base.sk)
- goto release;
list_for_each_entry(assoc, &ep->asocs, asocs) {
if (cb->args[4] < cb->args[1])
goto next;
@@ -306,7 +301,7 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
cb->nlh->nlmsg_seq,
NLM_F_MULTI, cb->nlh) < 0) {
cb->args[3] = 1;
- err = 2;
+ err = 1;
goto release;
}
cb->args[3] = 1;
@@ -315,7 +310,7 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) {
- err = 2;
+ err = 1;
goto release;
}
next:
@@ -327,10 +322,35 @@ next:
cb->args[4] = 0;
release:
release_sock(sk);
+ sock_put(sk);
return err;
+}
+
+static int sctp_get_sock(struct sctp_transport *tsp, void *p)
+{
+ struct sctp_endpoint *ep = tsp->asoc->ep;
+ struct sctp_comm_param *commp = p;
+ struct sock *sk = ep->base.sk;
+ struct netlink_callback *cb = commp->cb;
+ const struct inet_diag_req_v2 *r = commp->r;
+ struct sctp_association *assoc =
+ list_entry(ep->asocs.next, struct sctp_association, asocs);
+
+ /* find the ep only once through the transports by this condition */
+ if (tsp->asoc != assoc)
+ goto out;
+
+ if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
+ goto out;
+
+ sock_hold(sk);
+ cb->args[5] = (long)sk;
+
+ return 1;
+
out:
cb->args[2]++;
- return err;
+ return 0;
}
static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
@@ -350,7 +370,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
if (cb->args[4] < cb->args[1])
goto next;
- if ((r->idiag_states & ~TCPF_LISTEN) && !list_empty(&ep->asocs))
+ if (!(r->idiag_states & TCPF_LISTEN) && !list_empty(&ep->asocs))
goto next;
if (r->sdiag_family != AF_UNSPEC &&
@@ -466,10 +486,18 @@ skip:
* 2 : to record the transport pos of this time's traversal
* 3 : to mark if we have dumped the ep info of the current asoc
* 4 : to work as a temporary variable to traversal list
+ * 5 : to save the sk we get from travelsing the tsp list.
*/
- if (!(idiag_states & ~TCPF_LISTEN))
+ if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE)))
goto done;
- sctp_for_each_transport(sctp_tsp_dump, net, cb->args[2], &commp);
+
+next:
+ cb->args[5] = 0;
+ sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp);
+
+ if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp))
+ goto next;
+
done:
cb->args[1] = cb->args[4];
cb->args[4] = 0;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 56f364d8f..46ffecc57 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -108,14 +108,9 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
/* What was the inbound interface for this chunk? */
int sctp_chunk_iif(const struct sctp_chunk *chunk)
{
- struct sctp_af *af;
- int iif = 0;
-
- af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version));
- if (af)
- iif = af->skb_iif(chunk->skb);
+ struct sk_buff *skb = chunk->skb;
- return iif;
+ return SCTP_INPUT_CB(skb)->af->skb_iif(skb);
}
/* RFC 2960 3.3.2 Initiation (INIT) (1)
@@ -261,7 +256,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types));
chunksize += sizeof(ecap_param);
- if (net->sctp.prsctp_enable)
+ if (asoc->prsctp_enable)
chunksize += sizeof(prsctp_param);
/* ADDIP: Section 4.2.7:
@@ -355,7 +350,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
sctp_addto_param(retval, num_ext, extensions);
}
- if (net->sctp.prsctp_enable)
+ if (asoc->prsctp_enable)
sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
if (sp->adaptation_ind) {
@@ -1585,7 +1580,6 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
struct sctp_association *asoc;
struct sk_buff *skb;
sctp_scope_t scope;
- struct sctp_af *af;
/* Create the bare association. */
scope = sctp_scope(sctp_source(chunk));
@@ -1595,16 +1589,10 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
asoc->temp = 1;
skb = chunk->skb;
/* Create an entry for the source address of the packet. */
- af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version));
- if (unlikely(!af))
- goto fail;
- af->from_skb(&asoc->c.peer_addr, skb, 1);
+ SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1);
+
nodata:
return asoc;
-
-fail:
- sctp_association_free(asoc);
- return NULL;
}
/* Build a cookie representing asoc.
@@ -2024,8 +2012,8 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
for (i = 0; i < num_ext; i++) {
switch (param.ext->chunks[i]) {
case SCTP_CID_FWD_TSN:
- if (net->sctp.prsctp_enable && !asoc->peer.prsctp_capable)
- asoc->peer.prsctp_capable = 1;
+ if (asoc->prsctp_enable && !asoc->peer.prsctp_capable)
+ asoc->peer.prsctp_capable = 1;
break;
case SCTP_CID_AUTH:
/* if the peer reports AUTH, assume that he
@@ -2169,7 +2157,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net,
break;
case SCTP_PARAM_FWD_TSN_SUPPORT:
- if (net->sctp.prsctp_enable)
+ if (ep->prsctp_enable)
break;
goto fallthrough;
@@ -2653,7 +2641,7 @@ do_addr_param:
break;
case SCTP_PARAM_FWD_TSN_SUPPORT:
- if (net->sctp.prsctp_enable) {
+ if (asoc->prsctp_enable) {
asoc->peer.prsctp_capable = 1;
break;
}
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index aa3712259..12d451933 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -806,8 +806,10 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
/* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */
if (sctp_state(asoc, SHUTDOWN_RECEIVED) &&
- sctp_sstate(sk, ESTABLISHED))
+ sctp_sstate(sk, ESTABLISHED)) {
+ sk->sk_state = SCTP_SS_CLOSING;
sk->sk_shutdown |= RCV_SHUTDOWN;
+ }
}
if (sctp_state(asoc, COOKIE_WAIT)) {
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index f1f08c8f2..d88bb2b0b 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6118,14 +6118,11 @@ static int sctp_eat_data(const struct sctp_association *asoc,
* chunk later.
*/
- if (!chunk->ecn_ce_done) {
- struct sctp_af *af;
+ if (asoc->peer.ecn_capable && !chunk->ecn_ce_done) {
+ struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af;
chunk->ecn_ce_done = 1;
- af = sctp_get_af_specific(
- ipver2af(ip_hdr(chunk->skb)->version));
-
- if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) {
+ if (af->is_ce(sctp_gso_headskb(chunk->skb))) {
/* Do real work as sideffect. */
sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE,
SCTP_U32(tsn));
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 7f5689a93..8ed2d99bd 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -202,7 +202,7 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id)
* could be a TCP-style listening socket or a socket which
* hasn't yet called connect() to establish an association.
*/
- if (!sctp_sstate(sk, ESTABLISHED))
+ if (!sctp_sstate(sk, ESTABLISHED) && !sctp_sstate(sk, CLOSING))
return NULL;
/* Get the first and the only association from the list. */
@@ -1068,7 +1068,7 @@ static int __sctp_connect(struct sock *sk,
* is already connected.
* It cannot be done even on a TCP-style listening socket.
*/
- if (sctp_sstate(sk, ESTABLISHED) ||
+ if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) ||
(sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
err = -EISCONN;
goto out_free;
@@ -1705,18 +1705,19 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
if (msg_name) {
/* Look for a matching association on the endpoint. */
asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport);
- if (!asoc) {
- /* If we could not find a matching association on the
- * endpoint, make sure that it is not a TCP-style
- * socket that already has an association or there is
- * no peeled-off association on another socket.
- */
- if ((sctp_style(sk, TCP) &&
- sctp_sstate(sk, ESTABLISHED)) ||
- sctp_endpoint_is_peeled_off(ep, &to)) {
- err = -EADDRNOTAVAIL;
- goto out_unlock;
- }
+
+ /* If we could not find a matching association on the
+ * endpoint, make sure that it is not a TCP-style
+ * socket that already has an association or there is
+ * no peeled-off association on another socket.
+ */
+ if (!asoc &&
+ ((sctp_style(sk, TCP) &&
+ (sctp_sstate(sk, ESTABLISHED) ||
+ sctp_sstate(sk, CLOSING))) ||
+ sctp_endpoint_is_peeled_off(ep, &to))) {
+ err = -EADDRNOTAVAIL;
+ goto out_unlock;
}
} else {
asoc = sctp_id2assoc(sk, associd);
@@ -1914,6 +1915,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
goto out_free;
}
+ if (sctp_wspace(asoc) < msg_len)
+ sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
+
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
if (!sctp_wspace(asoc)) {
err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
@@ -2063,7 +2067,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
{
struct sctp_ulpevent *event = NULL;
struct sctp_sock *sp = sctp_sk(sk);
- struct sk_buff *skb;
+ struct sk_buff *skb, *head_skb;
int copied;
int err = 0;
int skb_len;
@@ -2074,7 +2078,8 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
lock_sock(sk);
- if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED)) {
+ if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED) &&
+ !sctp_sstate(sk, CLOSING) && !sctp_sstate(sk, CLOSED)) {
err = -ENOTCONN;
goto out;
}
@@ -2099,12 +2104,16 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (err)
goto out_free;
- sock_recv_ts_and_drops(msg, sk, skb);
+ if (event->chunk && event->chunk->head_skb)
+ head_skb = event->chunk->head_skb;
+ else
+ head_skb = skb;
+ sock_recv_ts_and_drops(msg, sk, head_skb);
if (sctp_ulpevent_is_notification(event)) {
msg->msg_flags |= MSG_NOTIFICATION;
sp->pf->event_msgname(event, msg->msg_name, addr_len);
} else {
- sp->pf->skb_msgname(skb, msg->msg_name, addr_len);
+ sp->pf->skb_msgname(head_skb, msg->msg_name, addr_len);
}
/* Check if we allow SCTP_NXTINFO. */
@@ -3661,6 +3670,80 @@ static int sctp_setsockopt_recvnxtinfo(struct sock *sk,
return 0;
}
+static int sctp_setsockopt_pr_supported(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EINVAL;
+
+ if (optlen != sizeof(params))
+ goto out;
+
+ if (copy_from_user(&params, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (asoc) {
+ asoc->prsctp_enable = !!params.assoc_value;
+ } else if (!params.assoc_id) {
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ sp->ep->prsctp_enable = !!params.assoc_value;
+ } else {
+ goto out;
+ }
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_setsockopt_default_prinfo(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_default_prinfo info;
+ struct sctp_association *asoc;
+ int retval = -EINVAL;
+
+ if (optlen != sizeof(info))
+ goto out;
+
+ if (copy_from_user(&info, optval, sizeof(info))) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ if (info.pr_policy & ~SCTP_PR_SCTP_MASK)
+ goto out;
+
+ if (info.pr_policy == SCTP_PR_SCTP_NONE)
+ info.pr_value = 0;
+
+ asoc = sctp_id2assoc(sk, info.pr_assoc_id);
+ if (asoc) {
+ SCTP_PR_SET_POLICY(asoc->default_flags, info.pr_policy);
+ asoc->default_timetolive = info.pr_value;
+ } else if (!info.pr_assoc_id) {
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ SCTP_PR_SET_POLICY(sp->default_flags, info.pr_policy);
+ sp->default_timetolive = info.pr_value;
+ } else {
+ goto out;
+ }
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3821,6 +3904,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
case SCTP_RECVNXTINFO:
retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen);
break;
+ case SCTP_PR_SUPPORTED:
+ retval = sctp_setsockopt_pr_supported(sk, optval, optlen);
+ break;
+ case SCTP_DEFAULT_PRINFO:
+ retval = sctp_setsockopt_default_prinfo(sk, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -4003,6 +4092,8 @@ static int sctp_init_sock(struct sock *sk)
return -ESOCKTNOSUPPORT;
}
+ sk->sk_gso_type = SKB_GSO_SCTP;
+
/* Initialize default send parameters. These parameters can be
* modified with the SCTP_DEFAULT_SEND_PARAM socket option.
*/
@@ -4193,6 +4284,7 @@ static void sctp_shutdown(struct sock *sk, int how)
return;
if (how & SEND_SHUTDOWN) {
+ sk->sk_state = SCTP_SS_CLOSING;
ep = sctp_sk(sk)->ep;
if (!list_empty(&ep->asocs)) {
asoc = list_entry(ep->asocs.next,
@@ -4377,17 +4469,21 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
const union sctp_addr *paddr, void *p)
{
struct sctp_transport *transport;
- int err = 0;
+ int err = -ENOENT;
rcu_read_lock();
transport = sctp_addrs_lookup_transport(net, laddr, paddr);
if (!transport || !sctp_transport_hold(transport))
goto out;
- err = cb(transport, p);
+
+ sctp_association_hold(transport->asoc);
sctp_transport_put(transport);
-out:
rcu_read_unlock();
+ err = cb(transport, p);
+ sctp_association_put(transport->asoc);
+
+out:
return err;
}
EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
@@ -6164,6 +6260,148 @@ static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len,
return 0;
}
+static int sctp_getsockopt_pr_supported(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EFAULT;
+
+ if (len < sizeof(params)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(params);
+ if (copy_from_user(&params, optval, len))
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (asoc) {
+ params.assoc_value = asoc->prsctp_enable;
+ } else if (!params.assoc_id) {
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ params.assoc_value = sp->ep->prsctp_enable;
+ } else {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (put_user(len, optlen))
+ goto out;
+
+ if (copy_to_user(optval, &params, len))
+ goto out;
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_getsockopt_default_prinfo(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_default_prinfo info;
+ struct sctp_association *asoc;
+ int retval = -EFAULT;
+
+ if (len < sizeof(info)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(info);
+ if (copy_from_user(&info, optval, len))
+ goto out;
+
+ asoc = sctp_id2assoc(sk, info.pr_assoc_id);
+ if (asoc) {
+ info.pr_policy = SCTP_PR_POLICY(asoc->default_flags);
+ info.pr_value = asoc->default_timetolive;
+ } else if (!info.pr_assoc_id) {
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ info.pr_policy = SCTP_PR_POLICY(sp->default_flags);
+ info.pr_value = sp->default_timetolive;
+ } else {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (put_user(len, optlen))
+ goto out;
+
+ if (copy_to_user(optval, &info, len))
+ goto out;
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_prstatus params;
+ struct sctp_association *asoc;
+ int policy;
+ int retval = -EINVAL;
+
+ if (len < sizeof(params))
+ goto out;
+
+ len = sizeof(params);
+ if (copy_from_user(&params, optval, len)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ policy = params.sprstat_policy;
+ if (policy & ~SCTP_PR_SCTP_MASK)
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
+ if (!asoc)
+ goto out;
+
+ if (policy == SCTP_PR_SCTP_NONE) {
+ params.sprstat_abandoned_unsent = 0;
+ params.sprstat_abandoned_sent = 0;
+ for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) {
+ params.sprstat_abandoned_unsent +=
+ asoc->abandoned_unsent[policy];
+ params.sprstat_abandoned_sent +=
+ asoc->abandoned_sent[policy];
+ }
+ } else {
+ params.sprstat_abandoned_unsent =
+ asoc->abandoned_unsent[__SCTP_PR_INDEX(policy)];
+ params.sprstat_abandoned_sent =
+ asoc->abandoned_sent[__SCTP_PR_INDEX(policy)];
+ }
+
+ if (put_user(len, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(optval, &params, len)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
static int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -6317,6 +6555,17 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
case SCTP_RECVNXTINFO:
retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen);
break;
+ case SCTP_PR_SUPPORTED:
+ retval = sctp_getsockopt_pr_supported(sk, len, optval, optlen);
+ break;
+ case SCTP_DEFAULT_PRINFO:
+ retval = sctp_getsockopt_default_prinfo(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_PR_ASSOC_STATUS:
+ retval = sctp_getsockopt_pr_assocstatus(sk, len, optval,
+ optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -6864,7 +7113,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
if (cmsgs->srinfo->sinfo_flags &
~(SCTP_UNORDERED | SCTP_ADDR_OVER |
- SCTP_SACK_IMMEDIATELY |
+ SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK |
SCTP_ABORT | SCTP_EOF))
return -EINVAL;
break;
@@ -6888,7 +7137,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
if (cmsgs->sinfo->snd_flags &
~(SCTP_UNORDERED | SCTP_ADDR_OVER |
- SCTP_SACK_IMMEDIATELY |
+ SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK |
SCTP_ABORT | SCTP_EOF))
return -EINVAL;
break;
@@ -7565,10 +7814,13 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
/* If the association on the newsk is already closed before accept()
* is called, set RCV_SHUTDOWN flag.
*/
- if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP))
+ if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) {
+ newsk->sk_state = SCTP_SS_CLOSED;
newsk->sk_shutdown |= RCV_SHUTDOWN;
+ } else {
+ newsk->sk_state = SCTP_SS_ESTABLISHED;
+ }
- newsk->sk_state = SCTP_SS_ESTABLISHED;
release_sock(newsk);
}
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index d1e38308f..d85b803da 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -51,7 +51,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event);
/* Initialize an ULP event from an given skb. */
static void sctp_ulpevent_init(struct sctp_ulpevent *event,
- int msg_flags,
+ __u16 msg_flags,
unsigned int len)
{
memset(event, 0, sizeof(struct sctp_ulpevent));
@@ -60,7 +60,7 @@ static void sctp_ulpevent_init(struct sctp_ulpevent *event,
}
/* Create a new sctp_ulpevent. */
-static struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
+static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags,
gfp_t gfp)
{
struct sctp_ulpevent *event;
@@ -91,6 +91,7 @@ int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event)
static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
const struct sctp_association *asoc)
{
+ struct sctp_chunk *chunk = event->chunk;
struct sk_buff *skb;
/* Cast away the const, as we are just wanting to
@@ -101,6 +102,8 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
event->asoc = (struct sctp_association *)asoc;
atomic_add(event->rmem_len, &event->asoc->rmem_alloc);
sctp_skb_set_owner_r(skb, asoc->base.sk);
+ if (chunk && chunk->head_skb && !chunk->head_skb->sk)
+ chunk->head_skb->sk = asoc->base.sk;
}
/* A simple destructor to give up the reference to the association. */
@@ -699,6 +702,12 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
*/
sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff));
+ /* And hold the chunk as we need it for getting the IP headers
+ * later in recvmsg
+ */
+ sctp_chunk_hold(chunk);
+ event->chunk = chunk;
+
sctp_ulpevent_receive_data(event, asoc);
event->stream = ntohs(chunk->subh.data_hdr->stream);
@@ -710,11 +719,11 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
}
event->tsn = ntohl(chunk->subh.data_hdr->tsn);
event->msg_flags |= chunk->chunk_hdr->flags;
- event->iif = sctp_chunk_iif(chunk);
return event;
fail_mark:
+ sctp_chunk_put(chunk);
kfree_skb(skb);
fail:
return NULL;
@@ -1007,6 +1016,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
done:
sctp_assoc_rwnd_increase(event->asoc, len);
+ sctp_chunk_put(event->chunk);
sctp_ulpevent_release_owner(event);
}
@@ -1029,6 +1039,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)
}
done:
+ sctp_chunk_put(event->chunk);
sctp_ulpevent_release_owner(event);
}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index ec166d2bd..877e55066 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -204,7 +204,9 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
/* If the socket is just going to throw this away, do not
* even try to deliver it.
*/
- if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
+ if (sk->sk_shutdown & RCV_SHUTDOWN &&
+ (sk->sk_shutdown & SEND_SHUTDOWN ||
+ !sctp_ulpevent_is_notification(event)))
goto out_free;
if (!sctp_ulpevent_is_notification(event)) {
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 040ff627c..a7e42f9a4 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
ret = kstrtoul(val, 0, &num);
if (ret == -EINVAL)
goto out_inval;
- nbits = fls(num);
- if (num > (1U << nbits))
- nbits++;
+ nbits = fls(num - 1);
if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
goto out_inval;
*(unsigned int *)kp->arg = nbits;
@@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
bool
-rpcauth_cred_key_to_expire(struct rpc_cred *cred)
+rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred)
{
+ if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
+ return false;
if (!cred->cr_ops->crkey_to_expire)
return false;
return cred->cr_ops->crkey_to_expire(cred);
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 54dd3fdea..168219535 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
/* Fast track for non crkey_timeout (no key) underlying credentials */
- if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags))
+ if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
return 0;
/* Fast track for the normal case */
@@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
if (IS_ERR(tcred))
return -EACCES;
- if (!tcred->cr_ops->crkey_timeout) {
- set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags);
- ret = 0;
- goto out_put;
- }
-
/* Test for the almost error case */
ret = tcred->cr_ops->crkey_timeout(tcred);
if (ret != 0) {
@@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
}
-out_put:
put_rpccred(tcred);
return ret;
}
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index bf4b0e98f..976c7812b 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1017,8 +1017,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2;
auth->au_rslack = GSS_VERF_SLACK >> 2;
+ auth->au_flags = 0;
auth->au_ops = &authgss_ops;
auth->au_flavor = flavor;
+ if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
+ auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
atomic_set(&auth->au_count, 1);
kref_init(&gss_auth->kref);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 65427492b..605958353 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = {
.qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_INTEGRITY,
.name = "krb5i",
+ .datatouch = true,
},
[2] = {
.pseudoflavor = RPC_AUTH_GSS_KRB5P,
.qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_PRIVACY,
.name = "krb5p",
+ .datatouch = true,
},
};
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 7063d856a..5fec3abbe 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
}
EXPORT_SYMBOL(gss_pseudoflavor_to_service);
+bool
+gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+ int i;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
+ return gm->gm_pfs[i].datatouch;
+ }
+ return false;
+}
+
char *
gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
{
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 4605dc73d..d8582028b 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1231,8 +1231,9 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
if (status)
goto out;
- dprintk("RPC: svcauth_gss: gss major status = %d\n",
- ud.major_status);
+ dprintk("RPC: svcauth_gss: gss major status = %d "
+ "minor status = %d\n",
+ ud.major_status, ud.minor_status);
switch (ud.major_status) {
case GSS_S_CONTINUE_NEEDED:
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 8d9eb4d5d..4d17376b2 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -115,6 +115,7 @@ static
struct rpc_auth null_auth = {
.au_cslack = NUL_CALLSLACK,
.au_rslack = NUL_REPLYSLACK,
+ .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
.au_ops = &authnull_ops,
.au_flavor = RPC_AUTH_NULL,
.au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 9f65452b7..a99278c98 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -228,6 +228,7 @@ static
struct rpc_auth unix_auth = {
.au_cslack = UNX_CALLSLACK,
.au_rslack = NUL_REPLYSLACK,
+ .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
.au_ops = &authunix_ops,
.au_flavor = RPC_AUTH_UNIX,
.au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 553bf95f7..4d8e11f94 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -362,7 +362,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
cache_purge(cd);
spin_lock(&cache_list_lock);
write_lock(&cd->hash_lock);
- if (cd->entries || atomic_read(&cd->inuse)) {
+ if (cd->entries) {
write_unlock(&cd->hash_lock);
spin_unlock(&cache_list_lock);
goto out;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2808d550d..66f23b376 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -453,7 +453,7 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
struct rpc_xprt_switch *xps;
if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) {
- WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+ WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
xps = args->bc_xprt->xpt_bc_xps;
xprt_switch_get(xps);
} else {
@@ -520,7 +520,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
char servername[48];
if (args->bc_xprt) {
- WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+ WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
xprt = args->bc_xprt->xpt_bc_xprt;
if (xprt) {
xprt_get(xprt);
@@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata)
kfree(data);
}
-const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
+static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
.rpc_call_done = rpc_cb_add_xprt_done,
.rpc_release = rpc_cb_add_xprt_release,
};
@@ -2638,6 +2638,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
{
struct rpc_xprt_switch *xps;
struct rpc_xprt *xprt;
+ unsigned long reconnect_timeout;
unsigned char resvport;
int ret = 0;
@@ -2649,6 +2650,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
return -EAGAIN;
}
resvport = xprt->resvport;
+ reconnect_timeout = xprt->max_reconnect_timeout;
rcu_read_unlock();
xprt = xprt_create_transport(xprtargs);
@@ -2657,6 +2659,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
goto out_put_switch;
}
xprt->resvport = resvport;
+ xprt->max_reconnect_timeout = reconnect_timeout;
rpc_xprt_switch_set_roundrobin(xps);
if (setup) {
@@ -2673,6 +2676,27 @@ out_put_switch:
}
EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
+static int
+rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *data)
+{
+ unsigned long timeout = *((unsigned long *)data);
+
+ if (timeout < xprt->max_reconnect_timeout)
+ xprt->max_reconnect_timeout = timeout;
+ return 0;
+}
+
+void
+rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
+{
+ rpc_clnt_iterate_for_each_xprt(clnt,
+ rpc_xprt_cap_max_reconnect_timeout,
+ &timeo);
+}
+EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static void rpc_show_header(void)
{
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index fc48eca21..84f98cbe3 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1386,7 +1386,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
struct dentry *root, *gssd_dentry;
- struct net *net = data;
+ struct net *net = get_net(sb->s_fs_info);
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int err;
@@ -1419,7 +1419,6 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
sb);
if (err)
goto err_depopulate;
- sb->s_fs_info = get_net(net);
mutex_unlock(&sn->pipefs_sb_lock);
return 0;
@@ -1448,7 +1447,8 @@ static struct dentry *
rpc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_ns(fs_type, flags, current->nsproxy->net_ns, rpc_fill_super);
+ struct net *net = current->nsproxy->net_ns;
+ return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super);
}
static void rpc_kill_sb(struct super_block *sb)
@@ -1468,9 +1468,9 @@ static void rpc_kill_sb(struct super_block *sb)
RPC_PIPEFS_UMOUNT,
sb);
mutex_unlock(&sn->pipefs_sb_lock);
- put_net(net);
out:
kill_litter_super(sb);
+ put_net(net);
}
static struct file_system_type rpc_pipe_fs_type = {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index fcfd48d26..9ae588511 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue;
/*
* rpciod-related stuff
*/
-struct workqueue_struct *rpciod_workqueue;
+struct workqueue_struct *rpciod_workqueue __read_mostly;
+struct workqueue_struct *xprtiod_workqueue __read_mostly;
/*
* Disable the timer for a given RPC task. Should be called with
@@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
* lockless RPC_IS_QUEUED() test) before we've had a chance to test
* the RPC_TASK_RUNNING flag.
*/
-static void rpc_make_runnable(struct rpc_task *task)
+static void rpc_make_runnable(struct workqueue_struct *wq,
+ struct rpc_task *task)
{
bool need_wakeup = !rpc_test_and_set_running(task);
@@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task)
return;
if (RPC_IS_ASYNC(task)) {
INIT_WORK(&task->u.tk_work, rpc_async_schedule);
- queue_work(rpciod_workqueue, &task->u.tk_work);
+ queue_work(wq, &task->u.tk_work);
} else
wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
}
@@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
/**
- * __rpc_do_wake_up_task - wake up a single rpc_task
+ * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task
+ * @wq: workqueue on which to run task
* @queue: wait queue
* @task: task to be woken up
*
* Caller must hold queue->lock, and have cleared the task queued flag.
*/
-static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue,
+ struct rpc_task *task)
{
dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
task->tk_pid, jiffies);
@@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
__rpc_remove_wait_queue(queue, task);
- rpc_make_runnable(task);
+ rpc_make_runnable(wq, task);
dprintk("RPC: __rpc_wake_up_task done\n");
}
@@ -436,16 +441,25 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
/*
* Wake up a queued task while the queue lock is being held
*/
-static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue, struct rpc_task *task)
{
if (RPC_IS_QUEUED(task)) {
smp_rmb();
if (task->tk_waitqueue == queue)
- __rpc_do_wake_up_task(queue, task);
+ __rpc_do_wake_up_task_on_wq(wq, queue, task);
}
}
/*
+ * Wake up a queued task while the queue lock is being held
+ */
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+ rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
+}
+
+/*
* Wake up a task on a specific queue
*/
void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
@@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue)
/*
* Wake up the first task on the wait queue.
*/
-struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue,
bool (*func)(struct rpc_task *, void *), void *data)
{
struct rpc_task *task = NULL;
@@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
task = __rpc_find_next_queued(queue);
if (task != NULL) {
if (func(task, data))
- rpc_wake_up_task_queue_locked(queue, task);
+ rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
else
task = NULL;
}
@@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
return task;
}
+
+/*
+ * Wake up the first task on the wait queue.
+ */
+struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+ bool (*func)(struct rpc_task *, void *), void *data)
+{
+ return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data);
+}
EXPORT_SYMBOL_GPL(rpc_wake_up_first);
static bool rpc_wake_up_next_func(struct rpc_task *task, void *data)
@@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task)
bool is_async = RPC_IS_ASYNC(task);
rpc_set_active(task);
- rpc_make_runnable(task);
+ rpc_make_runnable(rpciod_workqueue, task);
if (!is_async)
__rpc_execute(task);
}
@@ -1071,10 +1095,22 @@ static int rpciod_start(void)
* Create the rpciod thread and wait for it to start.
*/
dprintk("RPC: creating workqueue rpciod\n");
- /* Note: highpri because network receive is latency sensitive */
- wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
+ if (!wq)
+ goto out_failed;
rpciod_workqueue = wq;
- return rpciod_workqueue != NULL;
+ /* Note: highpri because network receive is latency sensitive */
+ wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ if (!wq)
+ goto free_rpciod;
+ xprtiod_workqueue = wq;
+ return 1;
+free_rpciod:
+ wq = rpciod_workqueue;
+ rpciod_workqueue = NULL;
+ destroy_workqueue(wq);
+out_failed:
+ return 0;
}
static void rpciod_stop(void)
@@ -1088,6 +1124,9 @@ static void rpciod_stop(void)
wq = rpciod_workqueue;
rpciod_workqueue = NULL;
destroy_workqueue(wq);
+ wq = xprtiod_workqueue;
+ xprtiod_workqueue = NULL;
+ destroy_workqueue(wq);
}
void
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index cc9852897..c5b0cb4f4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
*statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
/* Encode reply */
- if (test_bit(RQ_DROPME, &rqstp->rq_flags)) {
+ if (*statp == rpc_drop_reply ||
+ test_bit(RQ_DROPME, &rqstp->rq_flags)) {
if (procp->pc_release)
procp->pc_release(rqstp, NULL, rqstp->rq_resp);
goto dropit;
}
+ if (*statp == rpc_autherr_badcred) {
+ if (procp->pc_release)
+ procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+ goto err_bad_auth;
+ }
if (*statp == rpc_success &&
(xdr = procp->pc_encode) &&
!xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 4f01f6310..c3f652395 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -21,6 +21,10 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+static unsigned int svc_rpc_per_connection_limit __read_mostly;
+module_param(svc_rpc_per_connection_limit, uint, 0644);
+
+
static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
static int svc_deferred_recv(struct svc_rqst *rqstp);
static struct cache_deferred_req *svc_defer(struct cache_req *req);
@@ -329,12 +333,45 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
}
EXPORT_SYMBOL_GPL(svc_print_addr);
+static bool svc_xprt_slots_in_range(struct svc_xprt *xprt)
+{
+ unsigned int limit = svc_rpc_per_connection_limit;
+ int nrqsts = atomic_read(&xprt->xpt_nr_rqsts);
+
+ return limit == 0 || (nrqsts >= 0 && nrqsts < limit);
+}
+
+static bool svc_xprt_reserve_slot(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+ if (!test_bit(RQ_DATA, &rqstp->rq_flags)) {
+ if (!svc_xprt_slots_in_range(xprt))
+ return false;
+ atomic_inc(&xprt->xpt_nr_rqsts);
+ set_bit(RQ_DATA, &rqstp->rq_flags);
+ }
+ return true;
+}
+
+static void svc_xprt_release_slot(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) {
+ atomic_dec(&xprt->xpt_nr_rqsts);
+ svc_xprt_enqueue(xprt);
+ }
+}
+
static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
{
if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
return true;
- if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED)))
- return xprt->xpt_ops->xpo_has_wspace(xprt);
+ if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) {
+ if (xprt->xpt_ops->xpo_has_wspace(xprt) &&
+ svc_xprt_slots_in_range(xprt))
+ return true;
+ trace_svc_xprt_no_write_space(xprt);
+ return false;
+ }
return false;
}
@@ -480,8 +517,6 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
rqstp->rq_reserved = space;
- if (xprt->xpt_ops->xpo_adjust_wspace)
- xprt->xpt_ops->xpo_adjust_wspace(xprt);
svc_xprt_enqueue(xprt);
}
}
@@ -512,8 +547,8 @@ static void svc_xprt_release(struct svc_rqst *rqstp)
rqstp->rq_res.head[0].iov_len = 0;
svc_reserve(rqstp, 0);
+ svc_xprt_release_slot(rqstp);
rqstp->rq_xprt = NULL;
-
svc_xprt_put(xprt);
}
@@ -781,7 +816,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
svc_add_new_temp_xprt(serv, newxpt);
else
module_put(xprt->xpt_class->xcl_owner);
- } else {
+ } else if (svc_xprt_reserve_slot(rqstp, xprt)) {
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
rqstp, rqstp->rq_pool->sp_id, xprt,
@@ -871,6 +906,7 @@ EXPORT_SYMBOL_GPL(svc_recv);
*/
void svc_drop(struct svc_rqst *rqstp)
{
+ trace_svc_drop(rqstp);
dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
svc_xprt_release(rqstp);
}
@@ -1148,6 +1184,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
spin_unlock(&xprt->xpt_lock);
dprintk("revisit canceled\n");
svc_xprt_put(xprt);
+ trace_svc_drop_deferred(dr);
kfree(dr);
return;
}
@@ -1205,6 +1242,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req)
set_bit(RQ_DROPME, &rqstp->rq_flags);
dr->handle.revisit = svc_revisit;
+ trace_svc_defer(rqstp);
return &dr->handle;
}
@@ -1245,6 +1283,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
struct svc_deferred_req,
handle.recent);
list_del_init(&dr->handle.recent);
+ trace_svc_revisit_deferred(dr);
} else
clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
spin_unlock(&xprt->xpt_lock);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index dadfec66d..57625f64e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -60,7 +60,6 @@
static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
int flags);
-static void svc_udp_data_ready(struct sock *);
static int svc_udp_recvfrom(struct svc_rqst *);
static int svc_udp_sendto(struct svc_rqst *);
static void svc_sock_detach(struct svc_xprt *);
@@ -398,48 +397,21 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp)
return svc_port_is_privileged(svc_addr(rqstp));
}
-static bool sunrpc_waitqueue_active(wait_queue_head_t *wq)
-{
- if (!wq)
- return false;
- /*
- * There should normally be a memory * barrier here--see
- * wq_has_sleeper().
- *
- * It appears that isn't currently necessary, though, basically
- * because callers all appear to have sufficient memory barriers
- * between the time the relevant change is made and the
- * time they call these callbacks.
- *
- * The nfsd code itself doesn't actually explicitly wait on
- * these waitqueues, but it may wait on them for example in
- * sendpage() or sendmsg() calls. (And those may be the only
- * places, since it it uses nonblocking reads.)
- *
- * Maybe we should add the memory barriers anyway, but these are
- * hot paths so we'd need to be convinced there's no sigificant
- * penalty.
- */
- return waitqueue_active(wq);
-}
-
/*
* INET callback when data has been received on the socket.
*/
-static void svc_udp_data_ready(struct sock *sk)
+static void svc_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), busy=%d\n",
svsk, sk,
test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
+ svsk->sk_odata(sk);
+ if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags))
+ svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible(wq);
}
/*
@@ -448,56 +420,22 @@ static void svc_udp_data_ready(struct sock *sk)
static void svc_write_space(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
- wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+ svsk->sk_owspace(sk);
svc_xprt_enqueue(&svsk->sk_xprt);
}
-
- if (sunrpc_waitqueue_active(wq)) {
- dprintk("RPC svc_write_space: someone sleeping on %p\n",
- svsk);
- wake_up_interruptible(wq);
- }
}
static int svc_tcp_has_wspace(struct svc_xprt *xprt)
{
- struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- int required;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
return 1;
- required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
- if (sk_stream_wspace(svsk->sk_sk) >= required ||
- (sk_stream_min_wspace(svsk->sk_sk) == 0 &&
- atomic_read(&xprt->xpt_reserved) == 0))
- return 1;
- set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
- return 0;
-}
-
-static void svc_tcp_write_space(struct sock *sk)
-{
- struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
- struct socket *sock = sk->sk_socket;
-
- if (!sk_stream_is_writeable(sk) || !sock)
- return;
- if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt))
- clear_bit(SOCK_NOSPACE, &sock->flags);
- svc_write_space(sk);
-}
-
-static void svc_tcp_adjust_wspace(struct svc_xprt *xprt)
-{
- struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-
- if (svc_tcp_has_wspace(xprt))
- clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
}
/*
@@ -746,7 +684,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
&svsk->sk_xprt, serv);
clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
- svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
+ svsk->sk_sk->sk_data_ready = svc_data_ready;
svsk->sk_sk->sk_write_space = svc_write_space;
/* initialise setting must have enough space to
@@ -786,11 +724,12 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
static void svc_tcp_listen_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq;
dprintk("svc: socket %p TCP (listen) state change %d\n",
sk, sk->sk_state);
+ if (svsk)
+ svsk->sk_odata(sk);
/*
* This callback may called twice when a new connection
* is established as a child socket inherits everything
@@ -808,10 +747,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
} else
printk("svc: socket %p: no user data\n", sk);
}
-
- wq = sk_sleep(sk);
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible_all(wq);
}
/*
@@ -820,7 +755,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
static void svc_tcp_state_change(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq = sk_sleep(sk);
dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
sk, sk->sk_state, sk->sk_user_data);
@@ -828,26 +762,12 @@ static void svc_tcp_state_change(struct sock *sk)
if (!svsk)
printk("svc: socket %p: no user data\n", sk);
else {
- set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
- }
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible_all(wq);
-}
-
-static void svc_tcp_data_ready(struct sock *sk)
-{
- struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq = sk_sleep(sk);
-
- dprintk("svc: socket %p TCP data ready (svsk %p)\n",
- sk, sk->sk_user_data);
- if (svsk) {
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
+ svsk->sk_ostate(sk);
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
+ }
}
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible(wq);
}
/*
@@ -901,6 +821,11 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
dprintk("%s: connect from %s\n", serv->sv_name,
__svc_print_addr(sin, buf, sizeof(buf)));
+ /* Reset the inherited callbacks before calling svc_setup_socket */
+ newsock->sk->sk_state_change = svsk->sk_ostate;
+ newsock->sk->sk_data_ready = svsk->sk_odata;
+ newsock->sk->sk_write_space = svsk->sk_owspace;
+
/* make sure that a write doesn't block forever when
* low on memory
*/
@@ -1317,7 +1242,6 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
.xpo_secure_port = svc_sock_secure_port,
- .xpo_adjust_wspace = svc_tcp_adjust_wspace,
};
static struct svc_xprt_class svc_tcp_class = {
@@ -1357,8 +1281,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
} else {
dprintk("setting up TCP socket for reading\n");
sk->sk_state_change = svc_tcp_state_change;
- sk->sk_data_ready = svc_tcp_data_ready;
- sk->sk_write_space = svc_tcp_write_space;
+ sk->sk_data_ready = svc_data_ready;
+ sk->sk_write_space = svc_write_space;
svsk->sk_reclen = 0;
svsk->sk_tcplen = 0;
@@ -1368,8 +1292,13 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- if (sk->sk_state != TCP_ESTABLISHED)
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ break;
+ default:
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ }
}
}
@@ -1428,17 +1357,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
/* Initialize the socket */
if (sock->type == SOCK_DGRAM)
svc_udp_init(svsk, serv);
- else {
- /* initialise setting must have enough space to
- * receive and respond to one request.
- */
- svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
- 4 * serv->sv_max_mesg);
+ else
svc_tcp_init(svsk, serv);
- }
- dprintk("svc: svc_setup_socket created %p (inet %p)\n",
- svsk, svsk->sk_sk);
+ dprintk("svc: svc_setup_socket created %p (inet %p), "
+ "listen %d close %d\n",
+ svsk, svsk->sk_sk,
+ test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags),
+ test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
return svsk;
}
@@ -1606,18 +1532,16 @@ static void svc_sock_detach(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct sock *sk = svsk->sk_sk;
- wait_queue_head_t *wq;
dprintk("svc: svc_sock_detach(%p)\n", svsk);
/* put back the old socket callbacks */
+ lock_sock(sk);
sk->sk_state_change = svsk->sk_ostate;
sk->sk_data_ready = svsk->sk_odata;
sk->sk_write_space = svsk->sk_owspace;
-
- wq = sk_sleep(sk);
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible(wq);
+ sk->sk_user_data = NULL;
+ release_sock(sk);
}
/*
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 216a13857..ea244b291 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
clear_bit(XPRT_LOCKED, &xprt->state);
smp_mb__after_atomic();
} else
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
}
/*
@@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
return;
- if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt))
+ if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
+ __xprt_lock_write_func, xprt))
return;
xprt_clear_locked(xprt);
}
@@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
return;
if (RPCXPRT_CONGESTED(xprt))
goto out_unlock;
- if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt))
+ if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
+ __xprt_lock_write_cong_func, xprt))
return;
out_unlock:
xprt_clear_locked(xprt);
@@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN);
spin_unlock_bh(&xprt->transport_lock);
}
@@ -672,12 +674,26 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN);
out:
spin_unlock_bh(&xprt->transport_lock);
}
+static bool
+xprt_has_timer(const struct rpc_xprt *xprt)
+{
+ return xprt->idle_timeout != 0;
+}
+
+static void
+xprt_schedule_autodisconnect(struct rpc_xprt *xprt)
+ __must_hold(&xprt->transport_lock)
+{
+ if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
+ mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout);
+}
+
static void
xprt_init_autodisconnect(unsigned long data)
{
@@ -686,10 +702,12 @@ xprt_init_autodisconnect(unsigned long data)
spin_lock(&xprt->transport_lock);
if (!list_empty(&xprt->recv))
goto out_abort;
+ /* Reset xprt->last_used to avoid connect/autodisconnect cycling */
+ xprt->last_used = jiffies;
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
goto out_abort;
spin_unlock(&xprt->transport_lock);
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
return;
out_abort:
spin_unlock(&xprt->transport_lock);
@@ -723,6 +741,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
goto out;
xprt->snd_task =NULL;
xprt->ops->release_xprt(xprt, NULL);
+ xprt_schedule_autodisconnect(xprt);
out:
spin_unlock_bh(&xprt->transport_lock);
wake_up_bit(&xprt->state, XPRT_LOCKED);
@@ -886,11 +905,6 @@ static void xprt_timer(struct rpc_task *task)
spin_unlock_bh(&xprt->transport_lock);
}
-static inline int xprt_has_timer(struct rpc_xprt *xprt)
-{
- return xprt->idle_timeout != 0;
-}
-
/**
* xprt_prepare_transmit - reserve the transport before sending a request
* @task: RPC task about to send a request
@@ -1278,9 +1292,7 @@ void xprt_release(struct rpc_task *task)
if (!list_empty(&req->rq_list))
list_del(&req->rq_list);
xprt->last_used = jiffies;
- if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
- mod_timer(&xprt->timer,
- xprt->last_used + xprt->idle_timeout);
+ xprt_schedule_autodisconnect(xprt);
spin_unlock_bh(&xprt->transport_lock);
if (req->rq_buffer)
xprt->ops->buf_free(req->rq_buffer);
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index dc9f3b513..ef19fa42c 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o \
- fmr_ops.o frwr_ops.o physical_ops.o \
+ fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
module.o
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 6326ebe8b..21cb3b150 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -19,13 +19,6 @@
* verb (fmr_op_unmap).
*/
-/* Transport recovery
- *
- * After a transport reconnect, fmr_op_map re-uses the MR already
- * allocated for the RPC, but generates a fresh rkey then maps the
- * MR again. This process is synchronous.
- */
-
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -35,62 +28,132 @@
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64)
-static struct workqueue_struct *fmr_recovery_wq;
-
-#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
+/* Access mode of externally registered pages */
+enum {
+ RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ,
+};
-int
-fmr_alloc_recovery_wq(void)
+bool
+fmr_is_supported(struct rpcrdma_ia *ia)
{
- fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
- return !fmr_recovery_wq ? -ENOMEM : 0;
+ if (!ia->ri_device->alloc_fmr) {
+ pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
+ ia->ri_device->name);
+ return false;
+ }
+ return true;
}
-void
-fmr_destroy_recovery_wq(void)
+static int
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
{
- struct workqueue_struct *wq;
+ static struct ib_fmr_attr fmr_attr = {
+ .max_pages = RPCRDMA_MAX_FMR_SGES,
+ .max_maps = 1,
+ .page_shift = PAGE_SHIFT
+ };
- if (!fmr_recovery_wq)
- return;
+ mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
+ sizeof(u64), GFP_KERNEL);
+ if (!mw->fmr.fm_physaddrs)
+ goto out_free;
- wq = fmr_recovery_wq;
- fmr_recovery_wq = NULL;
- destroy_workqueue(wq);
+ mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
+ sizeof(*mw->mw_sg), GFP_KERNEL);
+ if (!mw->mw_sg)
+ goto out_free;
+
+ sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
+
+ mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
+ &fmr_attr);
+ if (IS_ERR(mw->fmr.fm_mr))
+ goto out_fmr_err;
+
+ return 0;
+
+out_fmr_err:
+ dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
+ PTR_ERR(mw->fmr.fm_mr));
+
+out_free:
+ kfree(mw->mw_sg);
+ kfree(mw->fmr.fm_physaddrs);
+ return -ENOMEM;
}
static int
__fmr_unmap(struct rpcrdma_mw *mw)
{
LIST_HEAD(l);
+ int rc;
- list_add(&mw->fmr.fmr->list, &l);
- return ib_unmap_fmr(&l);
+ list_add(&mw->fmr.fm_mr->list, &l);
+ rc = ib_unmap_fmr(&l);
+ list_del_init(&mw->fmr.fm_mr->list);
+ return rc;
}
-/* Deferred reset of a single FMR. Generate a fresh rkey by
- * replacing the MR. There's no recovery if this fails.
- */
static void
-__fmr_recovery_worker(struct work_struct *work)
+fmr_op_release_mr(struct rpcrdma_mw *r)
{
- struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
- mw_work);
- struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ LIST_HEAD(unmap_list);
+ int rc;
- __fmr_unmap(mw);
- rpcrdma_put_mw(r_xprt, mw);
- return;
+ /* Ensure MW is not on any rl_registered list */
+ if (!list_empty(&r->mw_list))
+ list_del(&r->mw_list);
+
+ kfree(r->fmr.fm_physaddrs);
+ kfree(r->mw_sg);
+
+ /* In case this one was left mapped, try to unmap it
+ * to prevent dealloc_fmr from failing with EBUSY
+ */
+ rc = __fmr_unmap(r);
+ if (rc)
+ pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
+ r, rc);
+
+ rc = ib_dealloc_fmr(r->fmr.fm_mr);
+ if (rc)
+ pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
+ r, rc);
+
+ kfree(r);
}
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
+/* Reset of a single FMR.
*/
static void
-__fmr_queue_recovery(struct rpcrdma_mw *mw)
+fmr_op_recover_mr(struct rpcrdma_mw *mw)
{
- INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
- queue_work(fmr_recovery_wq, &mw->mw_work);
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ int rc;
+
+ /* ORDER: invalidate first */
+ rc = __fmr_unmap(mw);
+
+ /* ORDER: then DMA unmap */
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (rc)
+ goto out_release;
+
+ rpcrdma_put_mw(r_xprt, mw);
+ r_xprt->rx_stats.mrs_recovered++;
+ return;
+
+out_release:
+ pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
+ r_xprt->rx_stats.mrs_orphaned++;
+
+ spin_lock(&r_xprt->rx_buf.rb_mwlock);
+ list_del(&mw->mw_all);
+ spin_unlock(&r_xprt->rx_buf.rb_mwlock);
+
+ fmr_op_release_mr(mw);
}
static int
@@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}
-static int
-fmr_op_init(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
- struct ib_fmr_attr fmr_attr = {
- .max_pages = RPCRDMA_MAX_FMR_SGES,
- .max_maps = 1,
- .page_shift = PAGE_SHIFT
- };
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- struct rpcrdma_mw *r;
- int i, rc;
-
- spin_lock_init(&buf->rb_mwlock);
- INIT_LIST_HEAD(&buf->rb_mws);
- INIT_LIST_HEAD(&buf->rb_all);
-
- i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
- i += 2; /* head + tail */
- i *= buf->rb_max_requests; /* one set for each RPC slot */
- dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
-
- rc = -ENOMEM;
- while (i--) {
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- goto out;
-
- r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
- sizeof(u64), GFP_KERNEL);
- if (!r->fmr.physaddrs)
- goto out_free;
-
- r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
- if (IS_ERR(r->fmr.fmr))
- goto out_fmr_err;
-
- r->mw_xprt = r_xprt;
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
- return 0;
-
-out_fmr_err:
- rc = PTR_ERR(r->fmr.fmr);
- dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
- kfree(r->fmr.physaddrs);
-out_free:
- kfree(r);
-out:
- return rc;
-}
-
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
static int
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
+ int nsegs, bool writing, struct rpcrdma_mw **out)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_device;
- enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
int len, pageoff, i, rc;
struct rpcrdma_mw *mw;
+ u64 *dma_pages;
- mw = seg1->rl_mw;
- seg1->rl_mw = NULL;
- if (!mw) {
- mw = rpcrdma_get_mw(r_xprt);
- if (!mw)
- return -ENOMEM;
- } else {
- /* this is a retransmit; generate a fresh rkey */
- rc = __fmr_unmap(mw);
- if (rc)
- return rc;
- }
+ mw = rpcrdma_get_mw(r_xprt);
+ if (!mw)
+ return -ENOBUFS;
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (nsegs > RPCRDMA_MAX_FMR_SGES)
nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) {
- rpcrdma_map_one(device, seg, direction);
- mw->fmr.physaddrs[i] = seg->mr_dma;
+ if (seg->mr_page)
+ sg_set_page(&mw->mw_sg[i],
+ seg->mr_page,
+ seg->mr_len,
+ offset_in_page(seg->mr_offset));
+ else
+ sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
+ seg->mr_len);
len += seg->mr_len;
++seg;
++i;
@@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
-
- rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs,
- i, seg1->mr_dma);
+ mw->mw_nents = i;
+ mw->mw_dir = rpcrdma_data_dir(writing);
+ if (i == 0)
+ goto out_dmamap_err;
+
+ if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir))
+ goto out_dmamap_err;
+
+ for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
+ dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
+ rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
+ dma_pages[0]);
if (rc)
goto out_maperr;
- seg1->rl_mw = mw;
- seg1->mr_rkey = mw->fmr.fmr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- return i;
+ mw->mw_handle = mw->fmr.fm_mr->rkey;
+ mw->mw_length = len;
+ mw->mw_offset = dma_pages[0] + pageoff;
-out_maperr:
- dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
- __func__, len, (unsigned long long)seg1->mr_dma,
- pageoff, i, rc);
- while (i--)
- rpcrdma_unmap_one(device, --seg);
- return rc;
-}
+ *out = mw;
+ return mw->mw_nents;
-static void
-__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- int nsegs = seg->mr_nsegs;
+out_dmamap_err:
+ pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
+ mw->mw_sg, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
- while (nsegs--)
- rpcrdma_unmap_one(device, seg++);
+out_maperr:
+ pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+ len, (unsigned long long)dma_pages[0],
+ pageoff, mw->mw_nents, rc);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
}
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
*/
static void
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
- struct rpcrdma_mr_seg *seg;
- unsigned int i, nchunks;
- struct rpcrdma_mw *mw;
+ struct rpcrdma_mw *mw, *tmp;
LIST_HEAD(unmap_list);
int rc;
@@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* ORDER: Invalidate all of the req's MRs first
*
* ib_unmap_fmr() is slow, so use a single call instead
- * of one call per mapped MR.
+ * of one call per mapped FMR.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
-
- list_add(&mw->fmr.fmr->list, &unmap_list);
-
- i += seg->mr_nsegs;
- }
+ list_for_each_entry(mw, &req->rl_registered, mw_list)
+ list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
rc = ib_unmap_fmr(&unmap_list);
if (rc)
- pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+ goto out_reset;
/* ORDER: Now DMA unmap all of the req's MRs, and return
* them to the free MW list.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->mw_list);
+ list_del_init(&mw->fmr.fm_mr->list);
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ rpcrdma_put_mw(r_xprt, mw);
+ }
- __fmr_dma_unmap(r_xprt, seg);
- rpcrdma_put_mw(r_xprt, seg->rl_mw);
+ return;
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
+out_reset:
+ pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
- req->rl_nchunks = 0;
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->fmr.fm_mr->list);
+ fmr_op_recover_mr(mw);
+ }
}
/* Use a slow, safe mechanism to invalidate all memory regions
* that were registered for "req".
- *
- * In the asynchronous case, DMA unmapping occurs first here
- * because the rpcrdma_mr_seg is released immediately after this
- * call. It's contents won't be available in __fmr_dma_unmap later.
- * FIXME.
*/
static void
fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
bool sync)
{
- struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- unsigned int i;
-
- for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
-
- if (sync) {
- /* ORDER */
- __fmr_unmap(mw);
- __fmr_dma_unmap(r_xprt, seg);
- rpcrdma_put_mw(r_xprt, mw);
- } else {
- __fmr_dma_unmap(r_xprt, seg);
- __fmr_queue_recovery(mw);
- }
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
-}
-
-static void
-fmr_op_destroy(struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mw *r;
- int rc;
-
- while (!list_empty(&buf->rb_all)) {
- r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
- list_del(&r->mw_all);
- kfree(r->fmr.physaddrs);
- rc = ib_dealloc_fmr(r->fmr.fmr);
- if (rc)
- dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
- __func__, rc);
+ while (!list_empty(&req->rl_registered)) {
+ mw = list_first_entry(&req->rl_registered,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
- kfree(r);
+ if (sync)
+ fmr_op_recover_mr(mw);
+ else
+ rpcrdma_defer_mr_recovery(mw);
}
}
@@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync,
.ro_unmap_safe = fmr_op_unmap_safe,
+ .ro_recover_mr = fmr_op_recover_mr,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
- .ro_init = fmr_op_init,
- .ro_destroy = fmr_op_destroy,
+ .ro_init_mr = fmr_op_init_mr,
+ .ro_release_mr = fmr_op_release_mr,
.ro_displayname = "fmr",
};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index f02ab80aa..892b5e1d9 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -73,29 +73,71 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-static struct workqueue_struct *frwr_recovery_wq;
-
-#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM)
+bool
+frwr_is_supported(struct rpcrdma_ia *ia)
+{
+ struct ib_device_attr *attrs = &ia->ri_device->attrs;
+
+ if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ goto out_not_supported;
+ if (attrs->max_fast_reg_page_list_len == 0)
+ goto out_not_supported;
+ return true;
+
+out_not_supported:
+ pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
+ ia->ri_device->name);
+ return false;
+}
-int
-frwr_alloc_recovery_wq(void)
+static int
+frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
{
- frwr_recovery_wq = alloc_workqueue("frwr_recovery",
- FRWR_RECOVERY_WQ_FLAGS, 0);
- return !frwr_recovery_wq ? -ENOMEM : 0;
+ unsigned int depth = ia->ri_max_frmr_depth;
+ struct rpcrdma_frmr *f = &r->frmr;
+ int rc;
+
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
+ if (IS_ERR(f->fr_mr))
+ goto out_mr_err;
+
+ r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
+ if (!r->mw_sg)
+ goto out_list_err;
+
+ sg_init_table(r->mw_sg, depth);
+ init_completion(&f->fr_linv_done);
+ return 0;
+
+out_mr_err:
+ rc = PTR_ERR(f->fr_mr);
+ dprintk("RPC: %s: ib_alloc_mr status %i\n",
+ __func__, rc);
+ return rc;
+
+out_list_err:
+ rc = -ENOMEM;
+ dprintk("RPC: %s: sg allocation failure\n",
+ __func__);
+ ib_dereg_mr(f->fr_mr);
+ return rc;
}
-void
-frwr_destroy_recovery_wq(void)
+static void
+frwr_op_release_mr(struct rpcrdma_mw *r)
{
- struct workqueue_struct *wq;
+ int rc;
- if (!frwr_recovery_wq)
- return;
+ /* Ensure MW is not on any rl_registered list */
+ if (!list_empty(&r->mw_list))
+ list_del(&r->mw_list);
- wq = frwr_recovery_wq;
- frwr_recovery_wq = NULL;
- destroy_workqueue(wq);
+ rc = ib_dereg_mr(r->frmr.fr_mr);
+ if (rc)
+ pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
+ r, rc);
+ kfree(r->mw_sg);
+ kfree(r);
}
static int
@@ -124,90 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return 0;
}
-static void
-__frwr_reset_and_unmap(struct rpcrdma_mw *mw)
-{
- struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- int rc;
-
- rc = __frwr_reset_mr(ia, mw);
- ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
- if (rc)
- return;
- rpcrdma_put_mw(r_xprt, mw);
-}
-
-/* Deferred reset of a single FRMR. Generate a fresh rkey by
- * replacing the MR.
+/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
*
* There's no recovery if this fails. The FRMR is abandoned, but
* remains in rb_all. It will be cleaned up when the transport is
* destroyed.
*/
static void
-__frwr_recovery_worker(struct work_struct *work)
+frwr_op_recover_mr(struct rpcrdma_mw *mw)
{
- struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
- mw_work);
-
- __frwr_reset_and_unmap(r);
-}
-
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
- */
-static void
-__frwr_queue_recovery(struct rpcrdma_mw *r)
-{
- INIT_WORK(&r->mw_work, __frwr_recovery_worker);
- queue_work(frwr_recovery_wq, &r->mw_work);
-}
-
-static int
-__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, unsigned int depth)
-{
- struct rpcrdma_frmr *f = &r->frmr;
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int rc;
- f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
- if (IS_ERR(f->fr_mr))
- goto out_mr_err;
-
- r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
- if (!r->mw_sg)
- goto out_list_err;
-
- sg_init_table(r->mw_sg, depth);
-
- init_completion(&f->fr_linv_done);
-
- return 0;
+ rc = __frwr_reset_mr(ia, mw);
+ ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (rc)
+ goto out_release;
-out_mr_err:
- rc = PTR_ERR(f->fr_mr);
- dprintk("RPC: %s: ib_alloc_mr status %i\n",
- __func__, rc);
- return rc;
+ rpcrdma_put_mw(r_xprt, mw);
+ r_xprt->rx_stats.mrs_recovered++;
+ return;
-out_list_err:
- rc = -ENOMEM;
- dprintk("RPC: %s: sg allocation failure\n",
- __func__);
- ib_dereg_mr(f->fr_mr);
- return rc;
-}
+out_release:
+ pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
+ r_xprt->rx_stats.mrs_orphaned++;
-static void
-__frwr_release(struct rpcrdma_mw *r)
-{
- int rc;
+ spin_lock(&r_xprt->rx_buf.rb_mwlock);
+ list_del(&mw->mw_all);
+ spin_unlock(&r_xprt->rx_buf.rb_mwlock);
- rc = ib_dereg_mr(r->frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr status %i\n",
- __func__, rc);
- kfree(r->mw_sg);
+ frwr_op_release_mr(mw);
}
static int
@@ -343,54 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
complete_all(&frmr->fr_linv_done);
}
-static int
-frwr_op_init(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- int i;
-
- spin_lock_init(&buf->rb_mwlock);
- INIT_LIST_HEAD(&buf->rb_mws);
- INIT_LIST_HEAD(&buf->rb_all);
-
- i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
- i += 2; /* head + tail */
- i *= buf->rb_max_requests; /* one set for each RPC slot */
- dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
-
- while (i--) {
- struct rpcrdma_mw *r;
- int rc;
-
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- rc = __frwr_init(r, pd, depth);
- if (rc) {
- kfree(r);
- return rc;
- }
-
- r->mw_xprt = r_xprt;
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
-
- return 0;
-}
-
/* Post a REG_MR Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
static int
frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
+ int nsegs, bool writing, struct rpcrdma_mw **out)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
@@ -399,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int rc, i, n, dma_nents;
u8 key;
- mw = seg1->rl_mw;
- seg1->rl_mw = NULL;
+ mw = NULL;
do {
if (mw)
- __frwr_queue_recovery(mw);
+ rpcrdma_defer_mr_recovery(mw);
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
- return -ENOMEM;
+ return -ENOBUFS;
} while (mw->frmr.fr_state != FRMR_IS_INVALID);
frmr = &mw->frmr;
frmr->fr_state = FRMR_IS_VALID;
@@ -435,6 +383,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
}
mw->mw_nents = i;
mw->mw_dir = rpcrdma_data_dir(writing);
+ if (i == 0)
+ goto out_dmamap_err;
dma_nents = ib_dma_map_sg(ia->ri_device,
mw->mw_sg, mw->mw_nents, mw->mw_dir);
@@ -468,36 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc)
goto out_senderr;
- seg1->rl_mw = mw;
- seg1->mr_rkey = mr->rkey;
- seg1->mr_base = mr->iova;
- seg1->mr_nsegs = mw->mw_nents;
- seg1->mr_len = mr->length;
+ mw->mw_handle = mr->rkey;
+ mw->mw_length = mr->length;
+ mw->mw_offset = mr->iova;
+ *out = mw;
return mw->mw_nents;
out_dmamap_err:
pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
mw->mw_sg, mw->mw_nents);
- return -ENOMEM;
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
out_mapmr_err:
pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
frmr->fr_mr, n, mw->mw_nents);
- rc = n < 0 ? n : -EIO;
- __frwr_queue_recovery(mw);
- return rc;
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
out_senderr:
- pr_err("rpcrdma: ib_post_send status %i\n", rc);
- __frwr_queue_recovery(mw);
- return rc;
+ pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
+ rpcrdma_defer_mr_recovery(mw);
+ return -ENOTCONN;
}
static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
{
- struct rpcrdma_mw *mw = seg->rl_mw;
struct rpcrdma_frmr *f = &mw->frmr;
struct ib_send_wr *invalidate_wr;
@@ -517,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
*
* Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
*/
static void
frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg;
- unsigned int i, nchunks;
+ struct rpcrdma_mw *mw, *tmp;
struct rpcrdma_frmr *f;
- struct rpcrdma_mw *mw;
int rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -536,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* Chain the LOCAL_INV Work Requests and post them with
* a single ib_post_send() call.
*/
+ f = NULL;
invalidate_wrs = pos = prev = NULL;
- seg = NULL;
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
-
- pos = __frwr_prepare_linv_wr(seg);
+ list_for_each_entry(mw, &req->rl_registered, mw_list) {
+ pos = __frwr_prepare_linv_wr(mw);
if (!invalidate_wrs)
invalidate_wrs = pos;
else
prev->next = pos;
prev = pos;
-
- i += seg->mr_nsegs;
+ f = &mw->frmr;
}
- f = &seg->rl_mw->frmr;
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
@@ -576,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* them to the free MW list.
*/
unmap:
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
- seg->rl_mw = NULL;
-
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->mw_list);
ib_dma_unmap_sg(ia->ri_device,
mw->mw_sg, mw->mw_nents, mw->mw_dir);
rpcrdma_put_mw(r_xprt, mw);
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
}
-
- req->rl_nchunks = 0;
return;
reset_mrs:
- pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+ pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
+ rdma_disconnect(ia->ri_id);
/* Find and reset the MRs in the LOCAL_INV WRs that did not
* get posted. This is synchronous, and slow.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
+ list_for_each_entry(mw, &req->rl_registered, mw_list) {
f = &mw->frmr;
-
if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
__frwr_reset_mr(ia, mw);
bad_wr = bad_wr->next;
}
-
- i += seg->mr_nsegs;
}
goto unmap;
}
@@ -620,38 +552,17 @@ static void
frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
bool sync)
{
- struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- unsigned int i;
- for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
+ while (!list_empty(&req->rl_registered)) {
+ mw = list_first_entry(&req->rl_registered,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
if (sync)
- __frwr_reset_and_unmap(mw);
+ frwr_op_recover_mr(mw);
else
- __frwr_queue_recovery(mw);
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
-}
-
-static void
-frwr_op_destroy(struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mw *r;
-
- /* Ensure stale MWs for "buf" are no longer in flight */
- flush_workqueue(frwr_recovery_wq);
-
- while (!list_empty(&buf->rb_all)) {
- r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
- list_del(&r->mw_all);
- __frwr_release(r);
- kfree(r);
+ rpcrdma_defer_mr_recovery(mw);
}
}
@@ -659,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
.ro_unmap_sync = frwr_op_unmap_sync,
.ro_unmap_safe = frwr_op_unmap_safe,
+ .ro_recover_mr = frwr_op_recover_mr,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
- .ro_init = frwr_op_init,
- .ro_destroy = frwr_op_destroy,
+ .ro_init_mr = frwr_op_init_mr,
+ .ro_release_mr = frwr_op_release_mr,
.ro_displayname = "frwr",
};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 35a81096e..a47f170b2 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf)
* MR when they can.
*/
static int
-rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
- int n, int nsegs)
+rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
{
size_t page_offset;
u32 remaining;
@@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
base = vec->iov_base;
page_offset = offset_in_page(base);
remaining = vec->iov_len;
- while (remaining && n < nsegs) {
+ while (remaining && n < RPCRDMA_MAX_SEGS) {
seg[n].mr_page = NULL;
seg[n].mr_offset = base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
@@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
- enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+ enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
{
- int len, n = 0, p;
- int page_base;
+ int len, n, p, page_base;
struct page **ppages;
+ n = 0;
if (pos == 0) {
- n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs);
- if (n == nsegs)
- return -EIO;
+ n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
+ if (n == RPCRDMA_MAX_SEGS)
+ goto out_overflow;
}
len = xdrbuf->page_len;
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = xdrbuf->page_base & ~PAGE_MASK;
p = 0;
- while (len && n < nsegs) {
+ while (len && n < RPCRDMA_MAX_SEGS) {
if (!ppages[p]) {
/* alloc the pagelist for receiving buffer */
ppages[p] = alloc_page(GFP_ATOMIC);
if (!ppages[p])
- return -ENOMEM;
+ return -EAGAIN;
}
seg[n].mr_page = ppages[p];
seg[n].mr_offset = (void *)(unsigned long) page_base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
if (seg[n].mr_len > PAGE_SIZE)
- return -EIO;
+ goto out_overflow;
len -= seg[n].mr_len;
++n;
++p;
@@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
}
/* Message overflows the seg array */
- if (len && n == nsegs)
- return -EIO;
+ if (len && n == RPCRDMA_MAX_SEGS)
+ goto out_overflow;
/* When encoding the read list, the tail is always sent inline */
if (type == rpcrdma_readch)
@@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* xdr pad bytes, saving the server an RDMA operation. */
if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
return n;
- n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs);
- if (n == nsegs)
- return -EIO;
+ n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
+ if (n == RPCRDMA_MAX_SEGS)
+ goto out_overflow;
}
return n;
+
+out_overflow:
+ pr_err("rpcrdma: segment array overflow\n");
+ return -EIO;
}
static inline __be32 *
-xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
{
- *iptr++ = cpu_to_be32(seg->mr_rkey);
- *iptr++ = cpu_to_be32(seg->mr_len);
- return xdr_encode_hyper(iptr, seg->mr_base);
+ *iptr++ = cpu_to_be32(mw->mw_handle);
+ *iptr++ = cpu_to_be32(mw->mw_length);
+ return xdr_encode_hyper(iptr, mw->mw_offset);
}
/* XDR-encode the Read list. Supports encoding a list of read
@@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, struct rpc_rqst *rqst,
__be32 *iptr, enum rpcrdma_chunktype rtype)
{
- struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
unsigned int pos;
int n, nsegs;
@@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
pos = rqst->rq_snd_buf.head[0].iov_len;
if (rtype == rpcrdma_areadch)
pos = 0;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
- RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ seg = req->rl_segments;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
- if (n <= 0)
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ false, &mw);
+ if (n < 0)
return ERR_PTR(n);
+ list_add(&mw->mw_list, &req->rl_registered);
*iptr++ = xdr_one; /* item present */
@@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
* have the same "position".
*/
*iptr++ = cpu_to_be32(pos);
- iptr = xdr_encode_rdma_segment(iptr, seg);
+ iptr = xdr_encode_rdma_segment(iptr, mw);
- dprintk("RPC: %5u %s: read segment pos %u "
- "%d@0x%016llx:0x%08x (%s)\n",
+ dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__, pos,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
+ mw->mw_length, (unsigned long long)mw->mw_offset,
+ mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.read_chunk_count++;
- req->rl_nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
- req->rl_nextseg = seg;
/* Finish Read list */
*iptr++ = xdr_zero; /* Next item not present */
@@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpc_rqst *rqst, __be32 *iptr,
enum rpcrdma_chunktype wtype)
{
- struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
int n, nsegs, nchunks;
__be32 *segcount;
@@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
return iptr;
}
+ seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
- wtype, seg,
- RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
- if (n <= 0)
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (n < 0)
return ERR_PTR(n);
+ list_add(&mw->mw_list, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, seg);
+ iptr = xdr_encode_rdma_segment(iptr, mw);
- dprintk("RPC: %5u %s: write segment "
- "%d@0x016%llx:0x%08x (%s)\n",
+ dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
+ mw->mw_length, (unsigned long long)mw->mw_offset,
+ mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.write_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- req->rl_nchunks++;
nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
- req->rl_nextseg = seg;
/* Update count of segments in this Write chunk */
*segcount = cpu_to_be32(nchunks);
@@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, struct rpc_rqst *rqst,
__be32 *iptr, enum rpcrdma_chunktype wtype)
{
- struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
int n, nsegs, nchunks;
__be32 *segcount;
@@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
return iptr;
}
- nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
- RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ seg = req->rl_segments;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
- if (n <= 0)
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (n < 0)
return ERR_PTR(n);
+ list_add(&mw->mw_list, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, seg);
+ iptr = xdr_encode_rdma_segment(iptr, mw);
- dprintk("RPC: %5u %s: reply segment "
- "%d@0x%016llx:0x%08x (%s)\n",
+ dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
+ mw->mw_length, (unsigned long long)mw->mw_offset,
+ mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.reply_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- req->rl_nchunks++;
nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
- req->rl_nextseg = seg;
/* Update count of segments in the Reply chunk */
*segcount = cpu_to_be32(nchunks);
@@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+ bool ddp_allowed;
ssize_t hdrlen;
size_t rpclen;
__be32 *iptr;
@@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
headerp->rm_type = rdma_msg;
+ /* When the ULP employs a GSS flavor that guarantees integrity
+ * or privacy, direct data placement of individual data items
+ * is not allowed.
+ */
+ ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
+ RPCAUTH_AUTH_DATATOUCH);
+
/*
* Chunks needed for results?
*
@@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch;
- else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech;
else
wtype = rpcrdma_replych;
@@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
rtype = rpcrdma_noch;
rpcrdma_inline_pullup(rqst);
rpclen = rqst->rq_svec[0].iov_len;
- } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+ } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
rpclen = rqst->rq_svec[0].iov_len;
rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
@@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* send a Call message with a Position Zero Read chunk and a
* regular Read chunk at the same time.
*/
- req->rl_nchunks = 0;
- req->rl_nextseg = req->rl_segments;
iptr = headerp->rm_body.rm_chunks;
iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
if (IS_ERR(iptr))
@@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
out_overflow:
pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
- /* Terminate this RPC. Chunks registered above will be
- * released by xprt_release -> xprt_rmda_free .
- */
- return -EIO;
+ iptr = ERR_PTR(-EIO);
out_unmap:
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@@ -705,15 +711,13 @@ out_unmap:
* RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
*/
static int
-rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
{
unsigned int i, total_len;
struct rpcrdma_write_chunk *cur_wchunk;
char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
i = be32_to_cpu(**iptrp);
- if (i > max)
- return -1;
cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
total_len = 0;
while (i--) {
@@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
return total_len;
}
-/*
- * Scatter inline received data back into provided iov's.
+/**
+ * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
+ * @rqst: controlling RPC request
+ * @srcp: points to RPC message payload in receive buffer
+ * @copy_len: remaining length of receive buffer content
+ * @pad: Write chunk pad bytes needed (zero for pure inline)
+ *
+ * The upper layer has set the maximum number of bytes it can
+ * receive in each component of rq_rcv_buf. These values are set in
+ * the head.iov_len, page_len, tail.iov_len, and buflen fields.
+ *
+ * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
+ * many cases this function simply updates iov_base pointers in
+ * rq_rcv_buf to point directly to the received reply data, to
+ * avoid copying reply data.
+ *
+ * Returns the count of bytes which had to be memcopied.
*/
-static void
+static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
- int i, npages, curlen, olen;
+ unsigned long fixup_copy_count;
+ int i, npages, curlen;
char *destp;
struct page **ppages;
int page_base;
+ /* The head iovec is redirected to the RPC reply message
+ * in the receive buffer, to avoid a memcopy.
+ */
+ rqst->rq_rcv_buf.head[0].iov_base = srcp;
+ rqst->rq_private_buf.head[0].iov_base = srcp;
+
+ /* The contents of the receive buffer that follow
+ * head.iov_len bytes are copied into the page list.
+ */
curlen = rqst->rq_rcv_buf.head[0].iov_len;
- if (curlen > copy_len) { /* write chunk header fixup */
+ if (curlen > copy_len)
curlen = copy_len;
- rqst->rq_rcv_buf.head[0].iov_len = curlen;
- }
-
dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
__func__, srcp, copy_len, curlen);
-
- /* Shift pointer for first receive segment only */
- rqst->rq_rcv_buf.head[0].iov_base = srcp;
srcp += curlen;
copy_len -= curlen;
- olen = copy_len;
- i = 0;
- rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
page_base = rqst->rq_rcv_buf.page_base;
ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
page_base &= ~PAGE_MASK;
-
+ fixup_copy_count = 0;
if (copy_len && rqst->rq_rcv_buf.page_len) {
- npages = PAGE_ALIGN(page_base +
- rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
- for (; i < npages; i++) {
+ int pagelist_len;
+
+ pagelist_len = rqst->rq_rcv_buf.page_len;
+ if (pagelist_len > copy_len)
+ pagelist_len = copy_len;
+ npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
+ for (i = 0; i < npages; i++) {
curlen = PAGE_SIZE - page_base;
- if (curlen > copy_len)
- curlen = copy_len;
+ if (curlen > pagelist_len)
+ curlen = pagelist_len;
+
dprintk("RPC: %s: page %d"
" srcp 0x%p len %d curlen %d\n",
__func__, i, srcp, copy_len, curlen);
@@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
kunmap_atomic(destp);
srcp += curlen;
copy_len -= curlen;
- if (copy_len == 0)
+ fixup_copy_count += curlen;
+ pagelist_len -= curlen;
+ if (!pagelist_len)
break;
page_base = 0;
}
- }
- if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
- curlen = copy_len;
- if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
- curlen = rqst->rq_rcv_buf.tail[0].iov_len;
- if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
- memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
- dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n",
- __func__, srcp, copy_len, curlen);
- rqst->rq_rcv_buf.tail[0].iov_len = curlen;
- copy_len -= curlen; ++i;
- } else
- rqst->rq_rcv_buf.tail[0].iov_len = 0;
-
- if (pad) {
- /* implicit padding on terminal chunk */
- unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
- while (pad--)
- p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+ /* Implicit padding for the last segment in a Write
+ * chunk is inserted inline at the front of the tail
+ * iovec. The upper layer ignores the content of
+ * the pad. Simply ensure inline content in the tail
+ * that follows the Write chunk is properly aligned.
+ */
+ if (pad)
+ srcp -= pad;
}
- if (copy_len)
- dprintk("RPC: %s: %d bytes in"
- " %d extra segments (%d lost)\n",
- __func__, olen, i, copy_len);
+ /* The tail iovec is redirected to the remaining data
+ * in the receive buffer, to avoid a memcopy.
+ */
+ if (copy_len || pad) {
+ rqst->rq_rcv_buf.tail[0].iov_base = srcp;
+ rqst->rq_private_buf.tail[0].iov_base = srcp;
+ }
- /* TBD avoid a warning from call_decode() */
- rqst->rq_private_buf = rqst->rq_rcv_buf;
+ return fixup_copy_count;
}
void
@@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
(headerp->rm_body.rm_chunks[1] == xdr_zero &&
headerp->rm_body.rm_chunks[2] != xdr_zero) ||
(headerp->rm_body.rm_chunks[1] != xdr_zero &&
- req->rl_nchunks == 0))
+ list_empty(&req->rl_registered)))
goto badheader;
if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
/* count any expected write chunks in read reply */
/* start at write chunk array count */
iptr = &headerp->rm_body.rm_chunks[2];
- rdmalen = rpcrdma_count_chunks(rep,
- req->rl_nchunks, 1, &iptr);
+ rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
/* check for validity, and no reply chunk after */
if (rdmalen < 0 || *iptr++ != xdr_zero)
goto badheader;
@@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
rep->rr_len -= RPCRDMA_HDRLEN_MIN;
status = rep->rr_len;
}
- /* Fix up the rpc results for upper layer */
- rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
+
+ r_xprt->rx_stats.fixup_copy_count +=
+ rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
+ rdmalen);
break;
case rdma_nomsg:
@@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
headerp->rm_body.rm_chunks[1] != xdr_zero ||
headerp->rm_body.rm_chunks[2] != xdr_one ||
- req->rl_nchunks == 0)
+ list_empty(&req->rl_registered))
goto badheader;
iptr = (__be32 *)((unsigned char *)headerp +
RPCRDMA_HDRLEN_MIN);
- rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+ rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
if (rdmalen < 0)
goto badheader;
r_xprt->rx_stats.total_rdma_reply += rdmalen;
@@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
badheader:
default:
- dprintk("%s: invalid rpcrdma reply header (type %d):"
- " chunks[012] == %d %d %d"
- " expected chunks <= %d\n",
- __func__, be32_to_cpu(headerp->rm_type),
- headerp->rm_body.rm_chunks[0],
- headerp->rm_body.rm_chunks[1],
- headerp->rm_body.rm_chunks[2],
- req->rl_nchunks);
+ dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
+ rqst->rq_task->tk_pid, __func__,
+ be32_to_cpu(headerp->rm_type));
status = -EIO;
r_xprt->rx_stats.bad_reply_count++;
break;
@@ -1035,7 +1049,7 @@ out:
* control: waking the next RPC waits until this RPC has
* relinquished all its Send Queue entries.
*/
- if (req->rl_nchunks)
+ if (!list_empty(&req->rl_registered))
r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
spin_lock_bh(&xprt->transport_lock);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 99d2e5b72..81f0e879f 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -558,7 +558,6 @@ out_sendbuf:
out_fail:
rpcrdma_buffer_put(req);
- r_xprt->rx_stats.failed_marshal_count++;
return NULL;
}
@@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer)
rpcrdma_buffer_put(req);
}
-/*
+/**
+ * xprt_rdma_send_request - marshal and send an RPC request
+ * @task: RPC task with an RPC message in rq_snd_buf
+ *
+ * Return values:
+ * 0: The request has been sent
+ * ENOTCONN: Caller needs to invoke connect logic then call again
+ * ENOBUFS: Call again later to send the request
+ * EIO: A permanent error occurred. The request was not sent,
+ * and don't try it again
+ *
* send_request invokes the meat of RPC RDMA. It must do the following:
+ *
* 1. Marshal the RPC request into an RPC RDMA request, which means
* putting a header in front of data, and creating IOVs for RDMA
* from those in the request.
@@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer)
* the request (rpcrdma_ep_post).
* 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
*/
-
static int
xprt_rdma_send_request(struct rpc_task *task)
{
@@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc = 0;
+ /* On retransmit, remove any previously registered chunks */
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+
rc = rpcrdma_marshal_req(rqst);
if (rc < 0)
goto failed_marshal;
@@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task)
return 0;
failed_marshal:
- r_xprt->rx_stats.failed_marshal_count++;
dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
__func__, rc);
if (rc == -EIO)
- return -EIO;
+ r_xprt->rx_stats.failed_marshal_count++;
+ if (rc != -ENOTCONN)
+ return rc;
drop_connection:
xprt_disconnect_done(xprt);
return -ENOTCONN; /* implies disconnect */
@@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
xprt->stat.bad_xids,
xprt->stat.req_u,
xprt->stat.bklog_u);
- seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
r_xprt->rx_stats.read_chunk_count,
r_xprt->rx_stats.write_chunk_count,
r_xprt->rx_stats.reply_chunk_count,
@@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
r_xprt->rx_stats.failed_marshal_count,
r_xprt->rx_stats.bad_reply_count,
r_xprt->rx_stats.nomsg_call_count);
+ seq_printf(seq, "%lu %lu %lu\n",
+ r_xprt->rx_stats.mrs_recovered,
+ r_xprt->rx_stats.mrs_orphaned,
+ r_xprt->rx_stats.mrs_allocated);
}
static int
@@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void)
__func__, rc);
rpcrdma_destroy_wq();
- frwr_destroy_recovery_wq();
rc = xprt_unregister_transport(&xprt_rdma_bc);
if (rc)
@@ -753,20 +769,13 @@ int xprt_rdma_init(void)
{
int rc;
- rc = frwr_alloc_recovery_wq();
- if (rc)
- return rc;
-
rc = rpcrdma_alloc_wq();
- if (rc) {
- frwr_destroy_recovery_wq();
+ if (rc)
return rc;
- }
rc = xprt_register_transport(&xprt_rdma);
if (rc) {
rpcrdma_destroy_wq();
- frwr_destroy_recovery_wq();
return rc;
}
@@ -774,7 +783,6 @@ int xprt_rdma_init(void)
if (rc) {
xprt_unregister_transport(&xprt_rdma);
rpcrdma_destroy_wq();
- frwr_destroy_recovery_wq();
return rc;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b044d98a1..799cce6cb 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>
#include <linux/module.h> /* try_module_get()/module_put() */
@@ -379,8 +380,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
struct rpcrdma_ia *ia = &xprt->rx_ia;
int rc;
- ia->ri_dma_mr = NULL;
-
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
@@ -391,47 +390,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_pd = ib_alloc_pd(ia->ri_device);
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
- dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
- __func__, rc);
+ pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
goto out2;
}
- if (memreg == RPCRDMA_FRMR) {
- if (!(ia->ri_device->attrs.device_cap_flags &
- IB_DEVICE_MEM_MGT_EXTENSIONS) ||
- (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
- dprintk("RPC: %s: FRMR registration "
- "not supported by HCA\n", __func__);
- memreg = RPCRDMA_MTHCAFMR;
- }
- }
- if (memreg == RPCRDMA_MTHCAFMR) {
- if (!ia->ri_device->alloc_fmr) {
- dprintk("RPC: %s: MTHCAFMR registration "
- "not supported by HCA\n", __func__);
- rc = -EINVAL;
- goto out3;
- }
- }
-
switch (memreg) {
case RPCRDMA_FRMR:
- ia->ri_ops = &rpcrdma_frwr_memreg_ops;
- break;
- case RPCRDMA_ALLPHYSICAL:
- ia->ri_ops = &rpcrdma_physical_memreg_ops;
- break;
+ if (frwr_is_supported(ia)) {
+ ia->ri_ops = &rpcrdma_frwr_memreg_ops;
+ break;
+ }
+ /*FALLTHROUGH*/
case RPCRDMA_MTHCAFMR:
- ia->ri_ops = &rpcrdma_fmr_memreg_ops;
- break;
+ if (fmr_is_supported(ia)) {
+ ia->ri_ops = &rpcrdma_fmr_memreg_ops;
+ break;
+ }
+ /*FALLTHROUGH*/
default:
- printk(KERN_ERR "RPC: Unsupported memory "
- "registration mode: %d\n", memreg);
- rc = -ENOMEM;
+ pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
+ memreg);
+ rc = -EINVAL;
goto out3;
}
- dprintk("RPC: %s: memory registration strategy is '%s'\n",
- __func__, ia->ri_ops->ro_displayname);
return 0;
@@ -585,8 +566,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
out2:
ib_free_cq(sendcq);
out1:
- if (ia->ri_dma_mr)
- ib_dereg_mr(ia->ri_dma_mr);
return rc;
}
@@ -600,8 +579,6 @@ out1:
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
- int rc;
-
dprintk("RPC: %s: entering, connected is %d\n",
__func__, ep->rep_connected);
@@ -615,12 +592,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_free_cq(ep->rep_attr.recv_cq);
ib_free_cq(ep->rep_attr.send_cq);
-
- if (ia->ri_dma_mr) {
- rc = ib_dereg_mr(ia->ri_dma_mr);
- dprintk("RPC: %s: ib_dereg_mr returned %i\n",
- __func__, rc);
- }
}
/*
@@ -777,6 +748,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_drain_qp(ia->ri_id->qp);
}
+static void
+rpcrdma_mr_recovery_worker(struct work_struct *work)
+{
+ struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+ rb_recovery_worker.work);
+ struct rpcrdma_mw *mw;
+
+ spin_lock(&buf->rb_recovery_lock);
+ while (!list_empty(&buf->rb_stale_mrs)) {
+ mw = list_first_entry(&buf->rb_stale_mrs,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
+ spin_unlock(&buf->rb_recovery_lock);
+
+ dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
+ mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);
+
+ spin_lock(&buf->rb_recovery_lock);
+ }
+ spin_unlock(&buf->rb_recovery_lock);
+}
+
+void
+rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
+{
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+
+ spin_lock(&buf->rb_recovery_lock);
+ list_add(&mw->mw_list, &buf->rb_stale_mrs);
+ spin_unlock(&buf->rb_recovery_lock);
+
+ schedule_delayed_work(&buf->rb_recovery_worker, 0);
+}
+
+static void
+rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ unsigned int count;
+ LIST_HEAD(free);
+ LIST_HEAD(all);
+
+ for (count = 0; count < 32; count++) {
+ struct rpcrdma_mw *mw;
+ int rc;
+
+ mw = kzalloc(sizeof(*mw), GFP_KERNEL);
+ if (!mw)
+ break;
+
+ rc = ia->ri_ops->ro_init_mr(ia, mw);
+ if (rc) {
+ kfree(mw);
+ break;
+ }
+
+ mw->mw_xprt = r_xprt;
+
+ list_add(&mw->mw_list, &free);
+ list_add(&mw->mw_all, &all);
+ }
+
+ spin_lock(&buf->rb_mwlock);
+ list_splice(&free, &buf->rb_mws);
+ list_splice(&all, &buf->rb_all);
+ r_xprt->rx_stats.mrs_allocated += count;
+ spin_unlock(&buf->rb_mwlock);
+
+ dprintk("RPC: %s: created %u MRs\n", __func__, count);
+}
+
+static void
+rpcrdma_mr_refresh_worker(struct work_struct *work)
+{
+ struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+ rb_refresh_worker.work);
+ struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+ rx_buf);
+
+ rpcrdma_create_mrs(r_xprt);
+}
+
struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
@@ -793,6 +848,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
spin_unlock(&buffer->rb_reqslock);
req->rl_cqe.done = rpcrdma_wc_send;
req->rl_buffer = &r_xprt->rx_buf;
+ INIT_LIST_HEAD(&req->rl_registered);
return req;
}
@@ -832,17 +888,23 @@ int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int i, rc;
buf->rb_max_requests = r_xprt->rx_data.max_requests;
buf->rb_bc_srv_max_requests = 0;
- spin_lock_init(&buf->rb_lock);
atomic_set(&buf->rb_credits, 1);
+ spin_lock_init(&buf->rb_mwlock);
+ spin_lock_init(&buf->rb_lock);
+ spin_lock_init(&buf->rb_recovery_lock);
+ INIT_LIST_HEAD(&buf->rb_mws);
+ INIT_LIST_HEAD(&buf->rb_all);
+ INIT_LIST_HEAD(&buf->rb_stale_mrs);
+ INIT_DELAYED_WORK(&buf->rb_refresh_worker,
+ rpcrdma_mr_refresh_worker);
+ INIT_DELAYED_WORK(&buf->rb_recovery_worker,
+ rpcrdma_mr_recovery_worker);
- rc = ia->ri_ops->ro_init(r_xprt);
- if (rc)
- goto out;
+ rpcrdma_create_mrs(r_xprt);
INIT_LIST_HEAD(&buf->rb_send_bufs);
INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -862,7 +924,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
}
INIT_LIST_HEAD(&buf->rb_recv_bufs);
- for (i = 0; i < buf->rb_max_requests + 2; i++) {
+ for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) {
struct rpcrdma_rep *rep;
rep = rpcrdma_create_rep(r_xprt);
@@ -918,17 +980,46 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
kfree(req);
}
+static void
+rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+ rx_buf);
+ struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ struct rpcrdma_mw *mw;
+ unsigned int count;
+
+ count = 0;
+ spin_lock(&buf->rb_mwlock);
+ while (!list_empty(&buf->rb_all)) {
+ mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&mw->mw_all);
+
+ spin_unlock(&buf->rb_mwlock);
+ ia->ri_ops->ro_release_mr(mw);
+ count++;
+ spin_lock(&buf->rb_mwlock);
+ }
+ spin_unlock(&buf->rb_mwlock);
+ r_xprt->rx_stats.mrs_allocated = 0;
+
+ dprintk("RPC: %s: released %u MRs\n", __func__, count);
+}
+
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ cancel_delayed_work_sync(&buf->rb_recovery_worker);
+
while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep;
rep = rpcrdma_buffer_get_rep_locked(buf);
rpcrdma_destroy_rep(ia, rep);
}
+ buf->rb_send_count = 0;
spin_lock(&buf->rb_reqslock);
while (!list_empty(&buf->rb_allreqs)) {
@@ -943,8 +1034,9 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
spin_lock(&buf->rb_reqslock);
}
spin_unlock(&buf->rb_reqslock);
+ buf->rb_recv_count = 0;
- ia->ri_ops->ro_destroy(buf);
+ rpcrdma_destroy_mrs(buf);
}
struct rpcrdma_mw *
@@ -962,8 +1054,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
spin_unlock(&buf->rb_mwlock);
if (!mw)
- pr_err("RPC: %s: no MWs available\n", __func__);
+ goto out_nomws;
return mw;
+
+out_nomws:
+ dprintk("RPC: %s: no MWs available\n", __func__);
+ schedule_delayed_work(&buf->rb_refresh_worker, 0);
+
+ /* Allow the reply handler and refresh worker to run */
+ cond_resched();
+
+ return NULL;
}
void
@@ -976,6 +1077,23 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
spin_unlock(&buf->rb_mwlock);
}
+static struct rpcrdma_rep *
+rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers)
+{
+ /* If an RPC previously completed without a reply (say, a
+ * credential problem or a soft timeout occurs) then hold off
+ * on supplying more Receive buffers until the number of new
+ * pending RPCs catches up to the number of posted Receives.
+ */
+ if (unlikely(buffers->rb_send_count < buffers->rb_recv_count))
+ return NULL;
+
+ if (unlikely(list_empty(&buffers->rb_recv_bufs)))
+ return NULL;
+ buffers->rb_recv_count++;
+ return rpcrdma_buffer_get_rep_locked(buffers);
+}
+
/*
* Get a set of request/reply buffers.
*
@@ -989,10 +1107,9 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
spin_lock(&buffers->rb_lock);
if (list_empty(&buffers->rb_send_bufs))
goto out_reqbuf;
+ buffers->rb_send_count++;
req = rpcrdma_buffer_get_req_locked(buffers);
- if (list_empty(&buffers->rb_recv_bufs))
- goto out_repbuf;
- req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ req->rl_reply = rpcrdma_buffer_get_rep(buffers);
spin_unlock(&buffers->rb_lock);
return req;
@@ -1000,11 +1117,6 @@ out_reqbuf:
spin_unlock(&buffers->rb_lock);
pr_warn("RPC: %s: out of request buffers\n", __func__);
return NULL;
-out_repbuf:
- spin_unlock(&buffers->rb_lock);
- pr_warn("RPC: %s: out of reply buffers\n", __func__);
- req->rl_reply = NULL;
- return req;
}
/*
@@ -1021,9 +1133,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
req->rl_reply = NULL;
spin_lock(&buffers->rb_lock);
+ buffers->rb_send_count--;
list_add_tail(&req->rl_free, &buffers->rb_send_bufs);
- if (rep)
+ if (rep) {
+ buffers->rb_recv_count--;
list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ }
spin_unlock(&buffers->rb_lock);
}
@@ -1037,8 +1152,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
struct rpcrdma_buffer *buffers = req->rl_buffer;
spin_lock(&buffers->rb_lock);
- if (!list_empty(&buffers->rb_recv_bufs))
- req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ req->rl_reply = rpcrdma_buffer_get_rep(buffers);
spin_unlock(&buffers->rb_lock);
}
@@ -1052,6 +1166,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
spin_lock(&buffers->rb_lock);
+ buffers->rb_recv_count--;
list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
spin_unlock(&buffers->rb_lock);
}
@@ -1060,14 +1175,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
* Wrappers for internal-use kmalloc memory registration, used by buffer code.
*/
-void
-rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
-{
- dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
- seg->mr_offset,
- (unsigned long long)seg->mr_dma, seg->mr_dmalen);
-}
-
/**
* rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
* @ia: controlling rpcrdma_ia
@@ -1150,7 +1257,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
if (rep) {
rc = rpcrdma_ep_post_recv(ia, ep, rep);
if (rc)
- goto out;
+ return rc;
req->rl_reply = NULL;
}
@@ -1175,10 +1282,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
if (rc)
- dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
- rc);
-out:
- return rc;
+ goto out_postsend_err;
+ return 0;
+
+out_postsend_err:
+ pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
+ return -ENOTCONN;
}
/*
@@ -1203,11 +1312,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
DMA_BIDIRECTIONAL);
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
-
if (rc)
- dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
- rc);
- return rc;
+ goto out_postrecv;
+ return 0;
+
+out_postrecv:
+ pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
+ return -ENOTCONN;
}
/**
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index c53abd128..a71b0f589 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -68,7 +68,6 @@ struct rpcrdma_ia {
struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
- struct ib_mr *ri_dma_mr;
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
@@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* o recv buffer (posted to provider)
* o ib_sge (also donated to provider)
* o status of reply (length, success or not)
- * o bookkeeping state to get run by tasklet (list, etc)
+ * o bookkeeping state to get run by reply handler (list, etc)
*
- * These are allocated during initialization, per-transport instance;
- * however, the tasklet execution list itself is global, as it should
- * always be pretty short.
+ * These are allocated during initialization, per-transport instance.
*
* N of these are associated with a transport instance, and stored in
* struct rpcrdma_buffer. N is the max number of outstanding requests.
*/
-#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
-
-/* data segments + head/tail for Call + head/tail for Reply */
-#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
-
-struct rpcrdma_buffer;
-
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
unsigned int rr_len;
@@ -232,8 +222,8 @@ struct rpcrdma_frmr {
};
struct rpcrdma_fmr {
- struct ib_fmr *fmr;
- u64 *physaddrs;
+ struct ib_fmr *fm_mr;
+ u64 *fm_physaddrs;
};
struct rpcrdma_mw {
@@ -245,8 +235,10 @@ struct rpcrdma_mw {
struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
};
- struct work_struct mw_work;
struct rpcrdma_xprt *mw_xprt;
+ u32 mw_handle;
+ u32 mw_length;
+ u64 mw_offset;
struct list_head mw_all;
};
@@ -266,33 +258,30 @@ struct rpcrdma_mw {
* of iovs for send operations. The reason is that the iovs passed to
* ib_post_{send,recv} must not be modified until the work request
* completes.
- *
- * NOTES:
- * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
- * marshal. The number needed varies depending on the iov lists that
- * are passed to us, the memory registration mode we are in, and if
- * physical addressing is used, the layout.
*/
+/* Maximum number of page-sized "segments" per chunk list to be
+ * registered or invalidated. Must handle a Reply chunk:
+ */
+enum {
+ RPCRDMA_MAX_IOV_SEGS = 3,
+ RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
+ RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS +
+ RPCRDMA_MAX_IOV_SEGS,
+};
+
struct rpcrdma_mr_seg { /* chunk descriptors */
- struct rpcrdma_mw *rl_mw; /* registered MR */
- u64 mr_base; /* registration result */
- u32 mr_rkey; /* registration result */
u32 mr_len; /* length of chunk or segment */
- int mr_nsegs; /* number of segments in chunk or 0 */
- enum dma_data_direction mr_dir; /* segment mapping direction */
- dma_addr_t mr_dma; /* segment mapping address */
- size_t mr_dmalen; /* segment mapping length */
struct page *mr_page; /* owning page, if any */
char *mr_offset; /* kva if no page, else offset */
};
#define RPCRDMA_MAX_IOVS (2)
+struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_free;
unsigned int rl_niovs;
- unsigned int rl_nchunks;
unsigned int rl_connect_cookie;
struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer;
@@ -300,12 +289,13 @@ struct rpcrdma_req {
struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
struct rpcrdma_regbuf *rl_rdmabuf;
struct rpcrdma_regbuf *rl_sendbuf;
- struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
- struct rpcrdma_mr_seg *rl_nextseg;
struct ib_cqe rl_cqe;
struct list_head rl_all;
bool rl_backchannel;
+
+ struct list_head rl_registered; /* registered segments */
+ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
static inline struct rpcrdma_req *
@@ -331,6 +321,7 @@ struct rpcrdma_buffer {
char *rb_pool;
spinlock_t rb_lock; /* protect buf lists */
+ int rb_send_count, rb_recv_count;
struct list_head rb_send_bufs;
struct list_head rb_recv_bufs;
u32 rb_max_requests;
@@ -341,6 +332,11 @@ struct rpcrdma_buffer {
struct list_head rb_allreqs;
u32 rb_bc_max_requests;
+
+ spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */
+ struct list_head rb_stale_mrs;
+ struct delayed_work rb_recovery_worker;
+ struct delayed_work rb_refresh_worker;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -387,6 +383,9 @@ struct rpcrdma_stats {
unsigned long bad_reply_count;
unsigned long nomsg_call_count;
unsigned long bcall_count;
+ unsigned long mrs_recovered;
+ unsigned long mrs_orphaned;
+ unsigned long mrs_allocated;
};
/*
@@ -395,23 +394,25 @@ struct rpcrdma_stats {
struct rpcrdma_xprt;
struct rpcrdma_memreg_ops {
int (*ro_map)(struct rpcrdma_xprt *,
- struct rpcrdma_mr_seg *, int, bool);
+ struct rpcrdma_mr_seg *, int, bool,
+ struct rpcrdma_mw **);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
struct rpcrdma_req *);
void (*ro_unmap_safe)(struct rpcrdma_xprt *,
struct rpcrdma_req *, bool);
+ void (*ro_recover_mr)(struct rpcrdma_mw *);
int (*ro_open)(struct rpcrdma_ia *,
struct rpcrdma_ep *,
struct rpcrdma_create_data_internal *);
size_t (*ro_maxpages)(struct rpcrdma_xprt *);
- int (*ro_init)(struct rpcrdma_xprt *);
- void (*ro_destroy)(struct rpcrdma_buffer *);
+ int (*ro_init_mr)(struct rpcrdma_ia *,
+ struct rpcrdma_mw *);
+ void (*ro_release_mr)(struct rpcrdma_mw *);
const char *ro_displayname;
};
extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
-extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
/*
* RPCRDMA transport -- encapsulates the structures above for
@@ -446,6 +447,8 @@ extern int xprt_rdma_pad_optimize;
*/
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *);
+bool frwr_is_supported(struct rpcrdma_ia *);
+bool fmr_is_supported(struct rpcrdma_ia *);
/*
* Endpoint calls - xprtrdma/verbs.c
@@ -477,6 +480,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
+
struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
size_t, gfp_t);
void rpcrdma_free_regbuf(struct rpcrdma_ia *,
@@ -484,9 +489,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
-int frwr_alloc_recovery_wq(void);
-void frwr_destroy_recovery_wq(void);
-
int rpcrdma_alloc_wq(void);
void rpcrdma_destroy_wq(void);
@@ -494,45 +496,12 @@ void rpcrdma_destroy_wq(void);
* Wrappers for chunk registration, shared by read/write chunk code.
*/
-void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
-
static inline enum dma_data_direction
rpcrdma_data_dir(bool writing)
{
return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
}
-static inline void
-rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
- enum dma_data_direction direction)
-{
- seg->mr_dir = direction;
- seg->mr_dmalen = seg->mr_len;
-
- if (seg->mr_page)
- seg->mr_dma = ib_dma_map_page(device,
- seg->mr_page, offset_in_page(seg->mr_offset),
- seg->mr_dmalen, seg->mr_dir);
- else
- seg->mr_dma = ib_dma_map_single(device,
- seg->mr_offset,
- seg->mr_dmalen, seg->mr_dir);
-
- if (ib_dma_mapping_error(device, seg->mr_dma))
- rpcrdma_mapping_error(seg);
-}
-
-static inline void
-rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
-{
- if (seg->mr_page)
- ib_dma_unmap_page(device,
- seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
- else
- ib_dma_unmap_single(device,
- seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-}
-
/*
* RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
*/
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 167cf5931..bf168838a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &xprt_min_resvport_limit,
- .extra2 = &xprt_max_resvport_limit
+ .extra2 = &xprt_max_resvport
},
{
.procname = "max_resvport",
@@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &xprt_min_resvport_limit,
+ .extra1 = &xprt_min_resvport,
.extra2 = &xprt_max_resvport_limit
},
{
@@ -177,7 +177,6 @@ static struct ctl_table sunrpc_table[] = {
* increase over time if the server is down or not responding.
*/
#define XS_TCP_INIT_REEST_TO (3U * HZ)
-#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ)
/*
* TCP idle timeout; client drops the transport socket if it is idle
@@ -642,6 +641,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
bool zerocopy = true;
+ bool vm_wait = false;
int status;
int sent;
@@ -677,15 +677,33 @@ static int xs_tcp_send_request(struct rpc_task *task)
return 0;
}
+ WARN_ON_ONCE(sent == 0 && status == 0);
+
+ if (status == -EAGAIN ) {
+ /*
+ * Return EAGAIN if we're sure we're hitting the
+ * socket send buffer limits.
+ */
+ if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
+ break;
+ /*
+ * Did we hit a memory allocation failure?
+ */
+ if (sent == 0) {
+ status = -ENOBUFS;
+ if (vm_wait)
+ break;
+ /* Retry, knowing now that we're below the
+ * socket send buffer limit
+ */
+ vm_wait = true;
+ }
+ continue;
+ }
if (status < 0)
break;
- if (sent == 0) {
- status = -EAGAIN;
- break;
- }
+ vm_wait = false;
}
- if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
- status = -ENOBUFS;
switch (status) {
case -ENOTSOCK:
@@ -755,11 +773,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
sk->sk_error_report = transport->old_error_report;
}
+static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+ clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+}
+
static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
{
smp_mb__before_atomic();
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
clear_bit(XPRT_CLOSING, &xprt->state);
+ xs_sock_reset_state_flags(xprt);
smp_mb__after_atomic();
}
@@ -962,10 +988,13 @@ static void xs_local_data_receive(struct sock_xprt *transport)
goto out;
for (;;) {
skb = skb_recv_datagram(sk, 0, 1, &err);
- if (skb == NULL)
+ if (skb != NULL) {
+ xs_local_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram(sk, skb);
+ continue;
+ }
+ if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
break;
- xs_local_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram(sk, skb);
}
out:
mutex_unlock(&transport->recv_mutex);
@@ -1043,10 +1072,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
goto out;
for (;;) {
skb = skb_recv_datagram(sk, 0, 1, &err);
- if (skb == NULL)
+ if (skb != NULL) {
+ xs_udp_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram_locked(sk, skb);
+ continue;
+ }
+ if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
break;
- xs_udp_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram(sk, skb);
}
out:
mutex_unlock(&transport->recv_mutex);
@@ -1074,7 +1106,14 @@ static void xs_data_ready(struct sock *sk)
if (xprt != NULL) {
struct sock_xprt *transport = container_of(xprt,
struct sock_xprt, xprt);
- queue_work(rpciod_workqueue, &transport->recv_worker);
+ transport->old_data_ready(sk);
+ /* Any data means we had a useful conversation, so
+ * then we don't need to delay the next reconnect
+ */
+ if (xprt->reestablish_timeout)
+ xprt->reestablish_timeout = 0;
+ if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ queue_work(xprtiod_workqueue, &transport->recv_worker);
}
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1474,10 +1513,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
for (;;) {
lock_sock(sk);
read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
- release_sock(sk);
- if (read <= 0)
- break;
- total += read;
+ if (read <= 0) {
+ clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+ release_sock(sk);
+ if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ break;
+ } else {
+ release_sock(sk);
+ total += read;
+ }
rd_desc.count = 65536;
}
out:
@@ -1493,34 +1537,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work)
}
/**
- * xs_tcp_data_ready - "data ready" callback for TCP sockets
- * @sk: socket with data to read
- *
- */
-static void xs_tcp_data_ready(struct sock *sk)
-{
- struct sock_xprt *transport;
- struct rpc_xprt *xprt;
-
- dprintk("RPC: xs_tcp_data_ready...\n");
-
- read_lock_bh(&sk->sk_callback_lock);
- if (!(xprt = xprt_from_sock(sk)))
- goto out;
- transport = container_of(xprt, struct sock_xprt, xprt);
-
- /* Any data means we had a useful conversation, so
- * the we don't need to delay the next reconnect
- */
- if (xprt->reestablish_timeout)
- xprt->reestablish_timeout = 0;
- queue_work(rpciod_workqueue, &transport->recv_worker);
-
-out:
- read_unlock_bh(&sk->sk_callback_lock);
-}
-
-/**
* xs_tcp_state_change - callback to handle TCP socket state changes
* @sk: socket whose state has changed
*
@@ -1714,7 +1730,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
static unsigned short xs_get_random_port(void)
{
- unsigned short range = xprt_max_resvport - xprt_min_resvport;
+ unsigned short range = xprt_max_resvport - xprt_min_resvport + 1;
unsigned short rand = (unsigned short) prandom_u32() % range;
return rand + xprt_min_resvport;
}
@@ -2156,6 +2172,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
write_unlock_bh(&sk->sk_callback_lock);
}
xs_udp_do_set_buffer_size(xprt);
+
+ xprt->stat.connect_start = jiffies;
}
static void xs_udp_setup_socket(struct work_struct *work)
@@ -2219,6 +2237,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
unsigned int keepcnt = xprt->timeout->to_retries + 1;
unsigned int opt_on = 1;
unsigned int timeo;
+ unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
/* TCP Keepalive options */
kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2230,6 +2249,16 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
(char *)&keepcnt, sizeof(keepcnt));
+ /* Avoid temporary address, they are bad for long-lived
+ * connections such as NFS mounts.
+ * RFC4941, section 3.6 suggests that:
+ * Individual applications, which have specific
+ * knowledge about the normal duration of connections,
+ * MAY override this as appropriate.
+ */
+ kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
+ (char *)&addr_pref, sizeof(addr_pref));
+
/* TCP user timeout (see RFC5482) */
timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
(xprt->timeout->to_retries + 1);
@@ -2241,7 +2270,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt;
- sk->sk_data_ready = xs_tcp_data_ready;
+ sk->sk_data_ready = xs_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
sock_set_flag(sk, SOCK_FASYNC);
@@ -2356,6 +2385,25 @@ out:
xprt_wake_pending_tasks(xprt, status);
}
+static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt)
+{
+ unsigned long start, now = jiffies;
+
+ start = xprt->stat.connect_start + xprt->reestablish_timeout;
+ if (time_after(start, now))
+ return start - now;
+ return 0;
+}
+
+static void xs_reconnect_backoff(struct rpc_xprt *xprt)
+{
+ xprt->reestablish_timeout <<= 1;
+ if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
+ xprt->reestablish_timeout = xprt->max_reconnect_timeout;
+ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+}
+
/**
* xs_connect - connect a socket to a remote endpoint
* @xprt: pointer to transport structure
@@ -2373,6 +2421,7 @@ out:
static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ unsigned long delay = 0;
WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
@@ -2384,19 +2433,15 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
/* Start by resetting any existing state */
xs_reset_transport(transport);
- queue_delayed_work(rpciod_workqueue,
- &transport->connect_worker,
- xprt->reestablish_timeout);
- xprt->reestablish_timeout <<= 1;
- if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
- xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
- if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
- xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
- } else {
+ delay = xs_reconnect_delay(xprt);
+ xs_reconnect_backoff(xprt);
+
+ } else
dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
- queue_delayed_work(rpciod_workqueue,
- &transport->connect_worker, 0);
- }
+
+ queue_delayed_work(xprtiod_workqueue,
+ &transport->connect_worker,
+ delay);
}
/**
@@ -2948,6 +2993,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
xprt->ops = &xs_tcp_ops;
xprt->timeout = &xs_tcp_default_timeout;
+ xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
+
INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
@@ -3157,8 +3204,12 @@ static int param_set_uint_minmax(const char *val,
static int param_set_portnr(const char *val, const struct kernel_param *kp)
{
- return param_set_uint_minmax(val, kp,
+ if (kp->arg == &xprt_min_resvport)
+ return param_set_uint_minmax(val, kp,
RPC_MIN_RESVPORT,
+ xprt_max_resvport);
+ return param_set_uint_minmax(val, kp,
+ xprt_min_resvport,
RPC_MAX_RESVPORT);
}
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 59658b2e9..a5fc9dd24 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -1286,8 +1286,8 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi)
}
EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
-static bool switchdev_port_same_parent_id(struct net_device *a,
- struct net_device *b)
+bool switchdev_port_same_parent_id(struct net_device *a,
+ struct net_device *b)
{
struct switchdev_attr a_attr = {
.orig_dev = a,
@@ -1323,6 +1323,7 @@ static u32 switchdev_port_fwd_mark_get(struct net_device *dev,
return dev->ifindex;
}
+EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id);
static void switchdev_port_fwd_mark_reset(struct net_device *group_dev,
u32 old_mark, u32 *reset_mark)
diff --git a/net/tipc/Makefile b/net/tipc/Makefile
index 57e460be4..31b9f9c52 100644
--- a/net/tipc/Makefile
+++ b/net/tipc/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_TIPC) := tipc.o
tipc-y += addr.o bcast.o bearer.o \
core.o link.o discover.o msg.o \
- name_distr.o subscr.o name_table.o net.o \
+ name_distr.o subscr.o monitor.o name_table.o net.o \
netlink.o netlink_compat.o node.o socket.o eth_media.o \
server.o socket.o
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 93f7c983b..bebb34780 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -43,9 +43,6 @@
#include <net/netns/generic.h>
#include "core.h"
-#define TIPC_ZONE_MASK 0xff000000u
-#define TIPC_CLUSTER_MASK 0xfffff000u
-
static inline u32 tipc_own_addr(struct net *net)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
@@ -60,7 +57,7 @@ static inline u32 tipc_zone_mask(u32 addr)
static inline u32 tipc_cluster_mask(u32 addr)
{
- return addr & TIPC_CLUSTER_MASK;
+ return addr & TIPC_ZONE_CLUSTER_MASK;
}
u32 tipc_own_addr(struct net *net);
@@ -73,4 +70,5 @@ int tipc_addr_node_valid(u32 addr);
int tipc_in_scope(u32 domain, u32 addr);
int tipc_addr_scope(u32 domain);
char *tipc_addr_string_fill(char *string, u32 addr);
+
#endif
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index a597708ae..65b1bbf13 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -1,7 +1,7 @@
/*
* net/tipc/bearer.c: TIPC bearer code
*
- * Copyright (c) 1996-2006, 2013-2014, Ericsson AB
+ * Copyright (c) 1996-2006, 2013-2016, Ericsson AB
* Copyright (c) 2004-2006, 2010-2013, Wind River Systems
* All rights reserved.
*
@@ -39,6 +39,7 @@
#include "bearer.h"
#include "link.h"
#include "discover.h"
+#include "monitor.h"
#include "bcast.h"
#include "netlink.h"
@@ -170,6 +171,27 @@ struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name)
return NULL;
}
+/* tipc_bearer_get_name - get the bearer name from its id.
+ * @net: network namespace
+ * @name: a pointer to the buffer where the name will be stored.
+ * @bearer_id: the id to get the name from.
+ */
+int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_bearer *b;
+
+ if (bearer_id >= MAX_BEARERS)
+ return -EINVAL;
+
+ b = rtnl_dereference(tn->bearer_list[bearer_id]);
+ if (!b)
+ return -EINVAL;
+
+ strcpy(name, b->name);
+ return 0;
+}
+
void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
@@ -224,7 +246,7 @@ static int tipc_enable_bearer(struct net *net, const char *name,
if (tipc_addr_domain_valid(disc_domain) &&
(disc_domain != tn->own_addr)) {
if (tipc_in_scope(disc_domain, tn->own_addr)) {
- disc_domain = tn->own_addr & TIPC_CLUSTER_MASK;
+ disc_domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK;
res = 0; /* accept any node in own cluster */
} else if (in_own_cluster_exact(net, disc_domain))
res = 0; /* accept specified node in own cluster */
@@ -313,6 +335,10 @@ restart:
rcu_assign_pointer(tn->bearer_list[bearer_id], b);
if (skb)
tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr);
+
+ if (tipc_mon_create(net, bearer_id))
+ return -ENOMEM;
+
pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
name,
tipc_addr_string_fill(addr_string, disc_domain), priority);
@@ -363,6 +389,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b)
tipc_disc_delete(b->link_req);
RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL);
kfree_rcu(b, rcu);
+ tipc_mon_delete(net, bearer_id);
}
int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
@@ -826,7 +853,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info)
u32 prio;
prio = TIPC_MEDIA_LINK_PRI;
- domain = tn->own_addr & TIPC_CLUSTER_MASK;
+ domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK;
if (!info->attrs[TIPC_NLA_BEARER])
return -EINVAL;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 60e49c3be..43757f1f9 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -1,7 +1,7 @@
/*
* net/tipc/bearer.h: Include file for TIPC bearer code
*
- * Copyright (c) 1996-2006, 2013-2014, Ericsson AB
+ * Copyright (c) 1996-2006, 2013-2016, Ericsson AB
* Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
@@ -197,6 +197,7 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *buf,
void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest);
void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest);
struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name);
+int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id);
struct tipc_media *tipc_media_find(const char *name);
void tipc_bearer_reset_all(struct net *net);
int tipc_bearer_setup(void);
diff --git a/net/tipc/core.c b/net/tipc/core.c
index fe1b062c4..236b043a4 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -57,6 +57,7 @@ static int __net_init tipc_init_net(struct net *net)
tn->net_id = 4711;
tn->own_addr = 0;
+ tn->mon_threshold = TIPC_DEF_MON_THRESHOLD;
get_random_bytes(&tn->random, sizeof(int));
INIT_LIST_HEAD(&tn->node_list);
spin_lock_init(&tn->node_list_lock);
diff --git a/net/tipc/core.h b/net/tipc/core.h
index eff58dc53..a1845fb27 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -66,11 +66,13 @@ struct tipc_bc_base;
struct tipc_link;
struct tipc_name_table;
struct tipc_server;
+struct tipc_monitor;
#define TIPC_MOD_VER "2.0.0"
-#define NODE_HTABLE_SIZE 512
-#define MAX_BEARERS 3
+#define NODE_HTABLE_SIZE 512
+#define MAX_BEARERS 3
+#define TIPC_DEF_MON_THRESHOLD 32
extern int tipc_net_id __read_mostly;
extern int sysctl_tipc_rmem[3] __read_mostly;
@@ -88,6 +90,10 @@ struct tipc_net {
u32 num_nodes;
u32 num_links;
+ /* Neighbor monitoring list */
+ struct tipc_monitor *monitors[MAX_BEARERS];
+ int mon_threshold;
+
/* Bearer list */
struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1];
@@ -126,6 +132,11 @@ static inline struct list_head *tipc_nodes(struct net *net)
return &tipc_net(net)->node_list;
}
+static inline unsigned int tipc_hashfn(u32 addr)
+{
+ return addr & (NODE_HTABLE_SIZE - 1);
+}
+
static inline u16 mod(u16 x)
{
return x & 0xffffu;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index ad9d477cc..6b109a808 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -135,9 +135,12 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
u16 caps = msg_node_capabilities(hdr);
bool respond = false;
bool dupl_addr = false;
+ int err;
- bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr));
+ err = bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr));
kfree_skb(skb);
+ if (err)
+ return;
/* Ensure message from node is valid and communication is permitted */
if (net_id != tn->net_id)
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 7d89f8713..877d94f34 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -42,6 +42,7 @@
#include "name_distr.h"
#include "discover.h"
#include "netlink.h"
+#include "monitor.h"
#include <linux/pkt_sched.h>
@@ -87,7 +88,6 @@ struct tipc_stats {
* @peer_bearer_id: bearer id used by link's peer endpoint
* @bearer_id: local bearer id used by link
* @tolerance: minimum link continuity loss needed to reset link [in ms]
- * @keepalive_intv: link keepalive timer interval
* @abort_limit: # of unacknowledged continuity probes needed to reset link
* @state: current state of link FSM
* @peer_caps: bitmap describing capabilities of peer node
@@ -96,6 +96,7 @@ struct tipc_stats {
* @pmsg: convenience pointer to "proto_msg" field
* @priority: current link priority
* @net_plane: current link network plane ('A' through 'H')
+ * @mon_state: cookie with information needed by link monitor
* @backlog_limit: backlog queue congestion thresholds (indexed by importance)
* @exp_msg_count: # of tunnelled messages expected during link changeover
* @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset
@@ -131,7 +132,6 @@ struct tipc_link {
u32 peer_bearer_id;
u32 bearer_id;
u32 tolerance;
- unsigned long keepalive_intv;
u32 abort_limit;
u32 state;
u16 peer_caps;
@@ -140,6 +140,7 @@ struct tipc_link {
char if_name[TIPC_MAX_IF_NAME];
u32 priority;
char net_plane;
+ struct tipc_mon_state mon_state;
u16 rst_cnt;
/* Failover/synch */
@@ -713,18 +714,25 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
bool setup = false;
u16 bc_snt = l->bc_sndlink->snd_nxt - 1;
u16 bc_acked = l->bc_rcvlink->acked;
-
- link_profile_stats(l);
+ struct tipc_mon_state *mstate = &l->mon_state;
switch (l->state) {
case LINK_ESTABLISHED:
case LINK_SYNCHING:
- if (l->silent_intv_cnt > l->abort_limit)
- return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
mtyp = STATE_MSG;
+ link_profile_stats(l);
+ tipc_mon_get_state(l->net, l->addr, mstate, l->bearer_id);
+ if (mstate->reset || (l->silent_intv_cnt > l->abort_limit))
+ return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
state = bc_acked != bc_snt;
- probe = l->silent_intv_cnt;
- l->silent_intv_cnt++;
+ state |= l->bc_rcvlink->rcv_unacked;
+ state |= l->rcv_unacked;
+ state |= !skb_queue_empty(&l->transmq);
+ state |= !skb_queue_empty(&l->deferdq);
+ probe = mstate->probing;
+ probe |= l->silent_intv_cnt;
+ if (probe || mstate->monitoring)
+ l->silent_intv_cnt++;
break;
case LINK_RESET:
setup = l->rst_cnt++ <= 4;
@@ -835,6 +843,7 @@ void tipc_link_reset(struct tipc_link *l)
l->stats.recv_info = 0;
l->stale_count = 0;
l->bc_peer_is_up = false;
+ memset(&l->mon_state, 0, sizeof(l->mon_state));
tipc_link_reset_stats(l);
}
@@ -1243,6 +1252,9 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
struct tipc_msg *hdr;
struct sk_buff_head *dfq = &l->deferdq;
bool node_up = link_is_up(l->bc_rcvlink);
+ struct tipc_mon_state *mstate = &l->mon_state;
+ int dlen = 0;
+ void *data;
/* Don't send protocol message during reset or link failover */
if (tipc_link_is_blocked(l))
@@ -1255,12 +1267,13 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE,
- TIPC_MAX_IF_NAME, l->addr,
+ tipc_max_domain_size, l->addr,
tipc_own_addr(l->net), 0, 0, 0);
if (!skb)
return;
hdr = buf_msg(skb);
+ data = msg_data(hdr);
msg_set_session(hdr, l->session);
msg_set_bearer_id(hdr, l->bearer_id);
msg_set_net_plane(hdr, l->net_plane);
@@ -1276,14 +1289,18 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
if (mtyp == STATE_MSG) {
msg_set_seq_gap(hdr, rcvgap);
- msg_set_size(hdr, INT_H_SIZE);
msg_set_probe(hdr, probe);
+ tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id);
+ msg_set_size(hdr, INT_H_SIZE + dlen);
+ skb_trim(skb, INT_H_SIZE + dlen);
l->stats.sent_states++;
l->rcv_unacked = 0;
} else {
/* RESET_MSG or ACTIVATE_MSG */
msg_set_max_pkt(hdr, l->advertised_mtu);
- strcpy(msg_data(hdr), l->if_name);
+ strcpy(data, l->if_name);
+ msg_set_size(hdr, INT_H_SIZE + TIPC_MAX_IF_NAME);
+ skb_trim(skb, INT_H_SIZE + TIPC_MAX_IF_NAME);
}
if (probe)
l->stats.sent_probes++;
@@ -1376,7 +1393,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
u16 peers_tol = msg_link_tolerance(hdr);
u16 peers_prio = msg_linkprio(hdr);
u16 rcv_nxt = l->rcv_nxt;
+ u16 dlen = msg_data_sz(hdr);
int mtyp = msg_type(hdr);
+ void *data;
char *if_name;
int rc = 0;
@@ -1386,6 +1405,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
if (tipc_own_addr(l->net) > msg_prevnode(hdr))
l->net_plane = msg_net_plane(hdr);
+ skb_linearize(skb);
+ hdr = buf_msg(skb);
+ data = msg_data(hdr);
+
switch (mtyp) {
case RESET_MSG:
@@ -1396,8 +1419,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
/* fall thru' */
case ACTIVATE_MSG:
- skb_linearize(skb);
- hdr = buf_msg(skb);
/* Complete own link name with peer's interface name */
if_name = strrchr(l->name, ':') + 1;
@@ -1405,7 +1426,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
break;
if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME)
break;
- strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME);
+ strncpy(if_name, data, TIPC_MAX_IF_NAME);
/* Update own tolerance if peer indicates a non-zero value */
if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
@@ -1453,6 +1474,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
rc = TIPC_LINK_UP_EVT;
break;
}
+ tipc_mon_rcv(l->net, data, dlen, l->addr,
+ &l->mon_state, l->bearer_id);
/* Send NACK if peer has sent pkts we haven't received yet */
if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l))
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
new file mode 100644
index 000000000..ed97a5876
--- /dev/null
+++ b/net/tipc/monitor.c
@@ -0,0 +1,804 @@
+/*
+ * net/tipc/monitor.c
+ *
+ * Copyright (c) 2016, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <net/genetlink.h>
+#include "core.h"
+#include "addr.h"
+#include "monitor.h"
+#include "bearer.h"
+
+#define MAX_MON_DOMAIN 64
+#define MON_TIMEOUT 120000
+#define MAX_PEER_DOWN_EVENTS 4
+
+/* struct tipc_mon_domain: domain record to be transferred between peers
+ * @len: actual size of domain record
+ * @gen: current generation of sender's domain
+ * @ack_gen: most recent generation of self's domain acked by peer
+ * @member_cnt: number of domain member nodes described in this record
+ * @up_map: bit map indicating which of the members the sender considers up
+ * @members: identity of the domain members
+ */
+struct tipc_mon_domain {
+ u16 len;
+ u16 gen;
+ u16 ack_gen;
+ u16 member_cnt;
+ u64 up_map;
+ u32 members[MAX_MON_DOMAIN];
+};
+
+/* struct tipc_peer: state of a peer node and its domain
+ * @addr: tipc node identity of peer
+ * @head_map: shows which other nodes currently consider peer 'up'
+ * @domain: most recent domain record from peer
+ * @hash: position in hashed lookup list
+ * @list: position in linked list, in circular ascending order by 'addr'
+ * @applied: number of reported domain members applied on this monitor list
+ * @is_up: peer is up as seen from this node
+ * @is_head: peer is assigned domain head as seen from this node
+ * @is_local: peer is in local domain and should be continuously monitored
+ * @down_cnt: - numbers of other peers which have reported this on lost
+ */
+struct tipc_peer {
+ u32 addr;
+ struct tipc_mon_domain *domain;
+ struct hlist_node hash;
+ struct list_head list;
+ u8 applied;
+ u8 down_cnt;
+ bool is_up;
+ bool is_head;
+ bool is_local;
+};
+
+struct tipc_monitor {
+ struct hlist_head peers[NODE_HTABLE_SIZE];
+ int peer_cnt;
+ struct tipc_peer *self;
+ rwlock_t lock;
+ struct tipc_mon_domain cache;
+ u16 list_gen;
+ u16 dom_gen;
+ struct net *net;
+ struct timer_list timer;
+ unsigned long timer_intv;
+};
+
+static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id)
+{
+ return tipc_net(net)->monitors[bearer_id];
+}
+
+const int tipc_max_domain_size = sizeof(struct tipc_mon_domain);
+
+/* dom_rec_len(): actual length of domain record for transport
+ */
+static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt)
+{
+ return ((void *)&dom->members - (void *)dom) + (mcnt * sizeof(u32));
+}
+
+/* dom_size() : calculate size of own domain based on number of peers
+ */
+static int dom_size(int peers)
+{
+ int i = 0;
+
+ while ((i * i) < peers)
+ i++;
+ return i < MAX_MON_DOMAIN ? i : MAX_MON_DOMAIN;
+}
+
+static void map_set(u64 *up_map, int i, unsigned int v)
+{
+ *up_map &= ~(1ULL << i);
+ *up_map |= ((u64)v << i);
+}
+
+static int map_get(u64 up_map, int i)
+{
+ return (up_map & (1 << i)) >> i;
+}
+
+static struct tipc_peer *peer_prev(struct tipc_peer *peer)
+{
+ return list_last_entry(&peer->list, struct tipc_peer, list);
+}
+
+static struct tipc_peer *peer_nxt(struct tipc_peer *peer)
+{
+ return list_first_entry(&peer->list, struct tipc_peer, list);
+}
+
+static struct tipc_peer *peer_head(struct tipc_peer *peer)
+{
+ while (!peer->is_head)
+ peer = peer_prev(peer);
+ return peer;
+}
+
+static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr)
+{
+ struct tipc_peer *peer;
+ unsigned int thash = tipc_hashfn(addr);
+
+ hlist_for_each_entry(peer, &mon->peers[thash], hash) {
+ if (peer->addr == addr)
+ return peer;
+ }
+ return NULL;
+}
+
+static struct tipc_peer *get_self(struct net *net, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+
+ return mon->self;
+}
+
+static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon)
+{
+ struct tipc_net *tn = tipc_net(net);
+
+ return mon->peer_cnt > tn->mon_threshold;
+}
+
+/* mon_identify_lost_members() : - identify amd mark potentially lost members
+ */
+static void mon_identify_lost_members(struct tipc_peer *peer,
+ struct tipc_mon_domain *dom_bef,
+ int applied_bef)
+{
+ struct tipc_peer *member = peer;
+ struct tipc_mon_domain *dom_aft = peer->domain;
+ int applied_aft = peer->applied;
+ int i;
+
+ for (i = 0; i < applied_bef; i++) {
+ member = peer_nxt(member);
+
+ /* Do nothing if self or peer already see member as down */
+ if (!member->is_up || !map_get(dom_bef->up_map, i))
+ continue;
+
+ /* Loss of local node must be detected by active probing */
+ if (member->is_local)
+ continue;
+
+ /* Start probing if member was removed from applied domain */
+ if (!applied_aft || (applied_aft < i)) {
+ member->down_cnt = 1;
+ continue;
+ }
+
+ /* Member loss is confirmed if it is still in applied domain */
+ if (!map_get(dom_aft->up_map, i))
+ member->down_cnt++;
+ }
+}
+
+/* mon_apply_domain() : match a peer's domain record against monitor list
+ */
+static void mon_apply_domain(struct tipc_monitor *mon,
+ struct tipc_peer *peer)
+{
+ struct tipc_mon_domain *dom = peer->domain;
+ struct tipc_peer *member;
+ u32 addr;
+ int i;
+
+ if (!dom || !peer->is_up)
+ return;
+
+ /* Scan across domain members and match against monitor list */
+ peer->applied = 0;
+ member = peer_nxt(peer);
+ for (i = 0; i < dom->member_cnt; i++) {
+ addr = dom->members[i];
+ if (addr != member->addr)
+ return;
+ peer->applied++;
+ member = peer_nxt(member);
+ }
+}
+
+/* mon_update_local_domain() : update after peer addition/removal/up/down
+ */
+static void mon_update_local_domain(struct tipc_monitor *mon)
+{
+ struct tipc_peer *self = mon->self;
+ struct tipc_mon_domain *cache = &mon->cache;
+ struct tipc_mon_domain *dom = self->domain;
+ struct tipc_peer *peer = self;
+ u64 prev_up_map = dom->up_map;
+ u16 member_cnt, i;
+ bool diff;
+
+ /* Update local domain size based on current size of cluster */
+ member_cnt = dom_size(mon->peer_cnt) - 1;
+ self->applied = member_cnt;
+
+ /* Update native and cached outgoing local domain records */
+ dom->len = dom_rec_len(dom, member_cnt);
+ diff = dom->member_cnt != member_cnt;
+ dom->member_cnt = member_cnt;
+ for (i = 0; i < member_cnt; i++) {
+ peer = peer_nxt(peer);
+ diff |= dom->members[i] != peer->addr;
+ dom->members[i] = peer->addr;
+ map_set(&dom->up_map, i, peer->is_up);
+ cache->members[i] = htonl(peer->addr);
+ }
+ diff |= dom->up_map != prev_up_map;
+ if (!diff)
+ return;
+ dom->gen = ++mon->dom_gen;
+ cache->len = htons(dom->len);
+ cache->gen = htons(dom->gen);
+ cache->member_cnt = htons(member_cnt);
+ cache->up_map = cpu_to_be64(dom->up_map);
+ mon_apply_domain(mon, self);
+}
+
+/* mon_update_neighbors() : update preceding neighbors of added/removed peer
+ */
+static void mon_update_neighbors(struct tipc_monitor *mon,
+ struct tipc_peer *peer)
+{
+ int dz, i;
+
+ dz = dom_size(mon->peer_cnt);
+ for (i = 0; i < dz; i++) {
+ mon_apply_domain(mon, peer);
+ peer = peer_prev(peer);
+ }
+}
+
+/* mon_assign_roles() : reassign peer roles after a network change
+ * The monitor list is consistent at this stage; i.e., each peer is monitoring
+ * a set of domain members as matched between domain record and the monitor list
+ */
+static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head)
+{
+ struct tipc_peer *peer = peer_nxt(head);
+ struct tipc_peer *self = mon->self;
+ int i = 0;
+
+ for (; peer != self; peer = peer_nxt(peer)) {
+ peer->is_local = false;
+
+ /* Update domain member */
+ if (i++ < head->applied) {
+ peer->is_head = false;
+ if (head == self)
+ peer->is_local = true;
+ continue;
+ }
+ /* Assign next domain head */
+ if (!peer->is_up)
+ continue;
+ if (peer->is_head)
+ break;
+ head = peer;
+ head->is_head = true;
+ i = 0;
+ }
+ mon->list_gen++;
+}
+
+void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *peer, *prev, *head;
+
+ write_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (!peer)
+ goto exit;
+ prev = peer_prev(peer);
+ list_del(&peer->list);
+ hlist_del(&peer->hash);
+ kfree(peer->domain);
+ kfree(peer);
+ mon->peer_cnt--;
+ head = peer_head(prev);
+ if (head == self)
+ mon_update_local_domain(mon);
+ mon_update_neighbors(mon, prev);
+
+ /* Revert to full-mesh monitoring if we reach threshold */
+ if (!tipc_mon_is_active(net, mon)) {
+ list_for_each_entry(peer, &self->list, list) {
+ kfree(peer->domain);
+ peer->domain = NULL;
+ peer->applied = 0;
+ }
+ }
+ mon_assign_roles(mon, head);
+exit:
+ write_unlock_bh(&mon->lock);
+}
+
+static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr,
+ struct tipc_peer **peer)
+{
+ struct tipc_peer *self = mon->self;
+ struct tipc_peer *cur, *prev, *p;
+
+ p = kzalloc(sizeof(*p), GFP_ATOMIC);
+ *peer = p;
+ if (!p)
+ return false;
+ p->addr = addr;
+
+ /* Add new peer to lookup list */
+ INIT_LIST_HEAD(&p->list);
+ hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]);
+
+ /* Sort new peer into iterator list, in ascending circular order */
+ prev = self;
+ list_for_each_entry(cur, &self->list, list) {
+ if ((addr > prev->addr) && (addr < cur->addr))
+ break;
+ if (((addr < cur->addr) || (addr > prev->addr)) &&
+ (prev->addr > cur->addr))
+ break;
+ prev = cur;
+ }
+ list_add_tail(&p->list, &cur->list);
+ mon->peer_cnt++;
+ mon_update_neighbors(mon, p);
+ return true;
+}
+
+void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *peer, *head;
+
+ write_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (!peer && !tipc_mon_add_peer(mon, addr, &peer))
+ goto exit;
+ peer->is_up = true;
+ head = peer_head(peer);
+ if (head == self)
+ mon_update_local_domain(mon);
+ mon_assign_roles(mon, head);
+exit:
+ write_unlock_bh(&mon->lock);
+}
+
+void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *peer, *head;
+ struct tipc_mon_domain *dom;
+ int applied;
+
+ write_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (!peer) {
+ pr_warn("Mon: unknown link %x/%u DOWN\n", addr, bearer_id);
+ goto exit;
+ }
+ applied = peer->applied;
+ peer->applied = 0;
+ dom = peer->domain;
+ peer->domain = NULL;
+ if (peer->is_head)
+ mon_identify_lost_members(peer, dom, applied);
+ kfree(dom);
+ peer->is_up = false;
+ peer->is_head = false;
+ peer->is_local = false;
+ peer->down_cnt = 0;
+ head = peer_head(peer);
+ if (head == self)
+ mon_update_local_domain(mon);
+ mon_assign_roles(mon, head);
+exit:
+ write_unlock_bh(&mon->lock);
+}
+
+/* tipc_mon_rcv - process monitor domain event message
+ */
+void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr,
+ struct tipc_mon_state *state, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_mon_domain *arrv_dom = data;
+ struct tipc_mon_domain dom_bef;
+ struct tipc_mon_domain *dom;
+ struct tipc_peer *peer;
+ u16 new_member_cnt = ntohs(arrv_dom->member_cnt);
+ int new_dlen = dom_rec_len(arrv_dom, new_member_cnt);
+ u16 new_gen = ntohs(arrv_dom->gen);
+ u16 acked_gen = ntohs(arrv_dom->ack_gen);
+ bool probing = state->probing;
+ int i, applied_bef;
+
+ state->probing = false;
+ if (!dlen)
+ return;
+
+ /* Sanity check received domain record */
+ if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) {
+ pr_warn_ratelimited("Received illegal domain record\n");
+ return;
+ }
+
+ /* Synch generation numbers with peer if link just came up */
+ if (!state->synched) {
+ state->peer_gen = new_gen - 1;
+ state->acked_gen = acked_gen;
+ state->synched = true;
+ }
+
+ if (more(acked_gen, state->acked_gen))
+ state->acked_gen = acked_gen;
+
+ /* Drop duplicate unless we are waiting for a probe response */
+ if (!more(new_gen, state->peer_gen) && !probing)
+ return;
+
+ write_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (!peer || !peer->is_up)
+ goto exit;
+
+ /* Peer is confirmed, stop any ongoing probing */
+ peer->down_cnt = 0;
+
+ /* Task is done for duplicate record */
+ if (!more(new_gen, state->peer_gen))
+ goto exit;
+
+ state->peer_gen = new_gen;
+
+ /* Cache current domain record for later use */
+ dom_bef.member_cnt = 0;
+ dom = peer->domain;
+ if (dom)
+ memcpy(&dom_bef, dom, dom->len);
+
+ /* Transform and store received domain record */
+ if (!dom || (dom->len < new_dlen)) {
+ kfree(dom);
+ dom = kmalloc(new_dlen, GFP_ATOMIC);
+ peer->domain = dom;
+ if (!dom)
+ goto exit;
+ }
+ dom->len = new_dlen;
+ dom->gen = new_gen;
+ dom->member_cnt = new_member_cnt;
+ dom->up_map = be64_to_cpu(arrv_dom->up_map);
+ for (i = 0; i < new_member_cnt; i++)
+ dom->members[i] = ntohl(arrv_dom->members[i]);
+
+ /* Update peers affected by this domain record */
+ applied_bef = peer->applied;
+ mon_apply_domain(mon, peer);
+ mon_identify_lost_members(peer, &dom_bef, applied_bef);
+ mon_assign_roles(mon, peer_head(peer));
+exit:
+ write_unlock_bh(&mon->lock);
+}
+
+void tipc_mon_prep(struct net *net, void *data, int *dlen,
+ struct tipc_mon_state *state, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_mon_domain *dom = data;
+ u16 gen = mon->dom_gen;
+ u16 len;
+
+ if (!tipc_mon_is_active(net, mon))
+ return;
+
+ /* Send only a dummy record with ack if peer has acked our last sent */
+ if (likely(state->acked_gen == gen)) {
+ len = dom_rec_len(dom, 0);
+ *dlen = len;
+ dom->len = htons(len);
+ dom->gen = htons(gen);
+ dom->ack_gen = htons(state->peer_gen);
+ dom->member_cnt = 0;
+ return;
+ }
+ /* Send the full record */
+ read_lock_bh(&mon->lock);
+ len = ntohs(mon->cache.len);
+ *dlen = len;
+ memcpy(data, &mon->cache, len);
+ read_unlock_bh(&mon->lock);
+ dom->ack_gen = htons(state->peer_gen);
+}
+
+void tipc_mon_get_state(struct net *net, u32 addr,
+ struct tipc_mon_state *state,
+ int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *peer;
+
+ /* Used cached state if table has not changed */
+ if (!state->probing &&
+ (state->list_gen == mon->list_gen) &&
+ (state->acked_gen == mon->dom_gen))
+ return;
+
+ read_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (peer) {
+ state->probing = state->acked_gen != mon->dom_gen;
+ state->probing |= peer->down_cnt;
+ state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS;
+ state->monitoring = peer->is_local;
+ state->monitoring |= peer->is_head;
+ state->list_gen = mon->list_gen;
+ }
+ read_unlock_bh(&mon->lock);
+}
+
+static void mon_timeout(unsigned long m)
+{
+ struct tipc_monitor *mon = (void *)m;
+ struct tipc_peer *self;
+ int best_member_cnt = dom_size(mon->peer_cnt) - 1;
+
+ write_lock_bh(&mon->lock);
+ self = mon->self;
+ if (self && (best_member_cnt != self->applied)) {
+ mon_update_local_domain(mon);
+ mon_assign_roles(mon, self);
+ }
+ write_unlock_bh(&mon->lock);
+ mod_timer(&mon->timer, jiffies + mon->timer_intv);
+}
+
+int tipc_mon_create(struct net *net, int bearer_id)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_monitor *mon;
+ struct tipc_peer *self;
+ struct tipc_mon_domain *dom;
+
+ if (tn->monitors[bearer_id])
+ return 0;
+
+ mon = kzalloc(sizeof(*mon), GFP_ATOMIC);
+ self = kzalloc(sizeof(*self), GFP_ATOMIC);
+ dom = kzalloc(sizeof(*dom), GFP_ATOMIC);
+ if (!mon || !self || !dom) {
+ kfree(mon);
+ kfree(self);
+ kfree(dom);
+ return -ENOMEM;
+ }
+ tn->monitors[bearer_id] = mon;
+ rwlock_init(&mon->lock);
+ mon->net = net;
+ mon->peer_cnt = 1;
+ mon->self = self;
+ self->domain = dom;
+ self->addr = tipc_own_addr(net);
+ self->is_up = true;
+ self->is_head = true;
+ INIT_LIST_HEAD(&self->list);
+ setup_timer(&mon->timer, mon_timeout, (unsigned long)mon);
+ mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff));
+ mod_timer(&mon->timer, jiffies + mon->timer_intv);
+ return 0;
+}
+
+void tipc_mon_delete(struct net *net, int bearer_id)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *peer, *tmp;
+
+ write_lock_bh(&mon->lock);
+ tn->monitors[bearer_id] = NULL;
+ list_for_each_entry_safe(peer, tmp, &self->list, list) {
+ list_del(&peer->list);
+ hlist_del(&peer->hash);
+ kfree(peer->domain);
+ kfree(peer);
+ }
+ mon->self = NULL;
+ write_unlock_bh(&mon->lock);
+ del_timer_sync(&mon->timer);
+ kfree(self->domain);
+ kfree(self);
+ kfree(mon);
+}
+
+int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size)
+{
+ struct tipc_net *tn = tipc_net(net);
+
+ if (cluster_size > TIPC_CLUSTER_SIZE)
+ return -EINVAL;
+
+ tn->mon_threshold = cluster_size;
+
+ return 0;
+}
+
+int tipc_nl_monitor_get_threshold(struct net *net)
+{
+ struct tipc_net *tn = tipc_net(net);
+
+ return tn->mon_threshold;
+}
+
+int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg)
+{
+ struct tipc_mon_domain *dom = peer->domain;
+ struct nlattr *attrs;
+ void *hdr;
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ NLM_F_MULTI, TIPC_NL_MON_PEER_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_MON_PEER);
+ if (!attrs)
+ goto msg_full;
+
+ if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_ADDR, peer->addr))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_APPLIED, peer->applied))
+ goto attr_msg_full;
+
+ if (peer->is_up)
+ if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_UP))
+ goto attr_msg_full;
+ if (peer->is_local)
+ if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_LOCAL))
+ goto attr_msg_full;
+ if (peer->is_head)
+ if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_HEAD))
+ goto attr_msg_full;
+
+ if (dom) {
+ if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_DOMGEN, dom->gen))
+ goto attr_msg_full;
+ if (nla_put_u64_64bit(msg->skb, TIPC_NLA_MON_PEER_UPMAP,
+ dom->up_map, TIPC_NLA_MON_PEER_PAD))
+ goto attr_msg_full;
+ if (nla_put(msg->skb, TIPC_NLA_MON_PEER_MEMBERS,
+ dom->member_cnt * sizeof(u32), &dom->members))
+ goto attr_msg_full;
+ }
+
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+ return 0;
+
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg,
+ u32 bearer_id, u32 *prev_node)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *peer;
+
+ if (!mon)
+ return -EINVAL;
+
+ read_lock_bh(&mon->lock);
+ peer = mon->self;
+ do {
+ if (*prev_node) {
+ if (peer->addr == *prev_node)
+ *prev_node = 0;
+ else
+ continue;
+ }
+ if (__tipc_nl_add_monitor_peer(peer, msg)) {
+ *prev_node = peer->addr;
+ read_unlock_bh(&mon->lock);
+ return -EMSGSIZE;
+ }
+ } while ((peer = peer_nxt(peer)) != mon->self);
+ read_unlock_bh(&mon->lock);
+
+ return 0;
+}
+
+int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg,
+ u32 bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ char bearer_name[TIPC_MAX_BEARER_NAME];
+ struct nlattr *attrs;
+ void *hdr;
+ int ret;
+
+ ret = tipc_bearer_get_name(net, bearer_name, bearer_id);
+ if (ret || !mon)
+ return -EINVAL;
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ NLM_F_MULTI, TIPC_NL_MON_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_MON);
+ if (!attrs)
+ goto msg_full;
+
+ read_lock_bh(&mon->lock);
+ if (nla_put_u32(msg->skb, TIPC_NLA_MON_REF, bearer_id))
+ goto attr_msg_full;
+ if (tipc_mon_is_active(net, mon))
+ if (nla_put_flag(msg->skb, TIPC_NLA_MON_ACTIVE))
+ goto attr_msg_full;
+ if (nla_put_string(msg->skb, TIPC_NLA_MON_BEARER_NAME, bearer_name))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEERCNT, mon->peer_cnt))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_MON_LISTGEN, mon->list_gen))
+ goto attr_msg_full;
+
+ read_unlock_bh(&mon->lock);
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+
+ return 0;
+
+attr_msg_full:
+ read_unlock_bh(&mon->lock);
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h
new file mode 100644
index 000000000..2a21b93e0
--- /dev/null
+++ b/net/tipc/monitor.h
@@ -0,0 +1,82 @@
+/*
+ * net/tipc/monitor.h
+ *
+ * Copyright (c) 2015, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_MONITOR_H
+#define _TIPC_MONITOR_H
+
+#include "netlink.h"
+
+/* struct tipc_mon_state: link instance's cache of monitor list and domain state
+ * @list_gen: current generation of this node's monitor list
+ * @gen: current generation of this node's local domain
+ * @peer_gen: most recent domain generation received from peer
+ * @acked_gen: most recent generation of self's domain acked by peer
+ * @monitoring: this peer endpoint should continuously monitored
+ * @probing: peer endpoint should be temporarily probed for potential loss
+ * @synched: domain record's generation has been synched with peer after reset
+ */
+struct tipc_mon_state {
+ u16 list_gen;
+ u16 peer_gen;
+ u16 acked_gen;
+ bool monitoring :1;
+ bool probing :1;
+ bool reset :1;
+ bool synched :1;
+};
+
+int tipc_mon_create(struct net *net, int bearer_id);
+void tipc_mon_delete(struct net *net, int bearer_id);
+
+void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id);
+void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id);
+void tipc_mon_prep(struct net *net, void *data, int *dlen,
+ struct tipc_mon_state *state, int bearer_id);
+void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr,
+ struct tipc_mon_state *state, int bearer_id);
+void tipc_mon_get_state(struct net *net, u32 addr,
+ struct tipc_mon_state *state,
+ int bearer_id);
+void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id);
+
+int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size);
+int tipc_nl_monitor_get_threshold(struct net *net);
+int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg,
+ u32 bearer_id);
+int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg,
+ u32 bearer_id, u32 *prev_node);
+
+extern const int tipc_max_domain_size;
+#endif
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 6b626a64b..a04fe9be1 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -62,6 +62,8 @@ static void publ_to_item(struct distr_item *i, struct publication *p)
/**
* named_prepare_buf - allocate & initialize a publication message
+ *
+ * The buffer returned is of size INT_H_SIZE + payload size
*/
static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
u32 dest)
@@ -141,9 +143,9 @@ static void named_distribute(struct net *net, struct sk_buff_head *list,
struct publication *publ;
struct sk_buff *skb = NULL;
struct distr_item *item = NULL;
- uint msg_dsz = (tipc_node_get_mtu(net, dnode, 0) / ITEM_SIZE) *
- ITEM_SIZE;
- uint msg_rem = msg_dsz;
+ u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) /
+ ITEM_SIZE) * ITEM_SIZE;
+ u32 msg_rem = msg_dsz;
list_for_each_entry(publ, pls, local_list) {
/* Prepare next buffer: */
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 56935df21..a84daec0a 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -52,7 +52,8 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = {
[TIPC_NLA_MEDIA] = { .type = NLA_NESTED, },
[TIPC_NLA_NODE] = { .type = NLA_NESTED, },
[TIPC_NLA_NET] = { .type = NLA_NESTED, },
- [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, }
+ [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, },
+ [TIPC_NLA_MON] = { .type = NLA_NESTED, },
};
const struct nla_policy
@@ -61,6 +62,12 @@ tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = {
[TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED }
};
+const struct nla_policy tipc_nl_monitor_policy[TIPC_NLA_MON_MAX + 1] = {
+ [TIPC_NLA_MON_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_MON_REF] = { .type = NLA_U32 },
+ [TIPC_NLA_MON_ACTIVATION_THRESHOLD] = { .type = NLA_U32 },
+};
+
const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
[TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC },
[TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 },
@@ -214,7 +221,23 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
.cmd = TIPC_NL_NAME_TABLE_GET,
.dumpit = tipc_nl_name_table_dump,
.policy = tipc_nl_policy,
- }
+ },
+ {
+ .cmd = TIPC_NL_MON_SET,
+ .doit = tipc_nl_node_set_monitor,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_MON_GET,
+ .doit = tipc_nl_node_get_monitor,
+ .dumpit = tipc_nl_node_dump_monitor,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_MON_PEER_GET,
+ .dumpit = tipc_nl_node_dump_monitor_peer,
+ .policy = tipc_nl_policy,
+ },
};
int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h
index ed1dbcb4a..4ba0ad422 100644
--- a/net/tipc/netlink.h
+++ b/net/tipc/netlink.h
@@ -55,6 +55,7 @@ extern const struct nla_policy tipc_nl_prop_policy[];
extern const struct nla_policy tipc_nl_bearer_policy[];
extern const struct nla_policy tipc_nl_media_policy[];
extern const struct nla_policy tipc_nl_udp_policy[];
+extern const struct nla_policy tipc_nl_monitor_policy[];
int tipc_netlink_start(void);
int tipc_netlink_compat_start(void);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 23d476184..21974191e 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -40,6 +40,7 @@
#include "name_distr.h"
#include "socket.h"
#include "bcast.h"
+#include "monitor.h"
#include "discover.h"
#include "netlink.h"
@@ -205,17 +206,6 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr)
return caps;
}
-/*
- * A trivial power-of-two bitmask technique is used for speed, since this
- * operation is done for every incoming TIPC packet. The number of hash table
- * entries has been chosen so that no hash chain exceeds 8 nodes and will
- * usually be much smaller (typically only a single node).
- */
-static unsigned int tipc_hashfn(u32 addr)
-{
- return addr & (NODE_HTABLE_SIZE - 1);
-}
-
static void tipc_node_kref_release(struct kref *kref)
{
struct tipc_node *n = container_of(kref, struct tipc_node, kref);
@@ -279,6 +269,7 @@ static void tipc_node_write_unlock(struct tipc_node *n)
u32 addr = 0;
u32 flags = n->action_flags;
u32 link_id = 0;
+ u32 bearer_id;
struct list_head *publ_list;
if (likely(!flags)) {
@@ -288,6 +279,7 @@ static void tipc_node_write_unlock(struct tipc_node *n)
addr = n->addr;
link_id = n->link_id;
+ bearer_id = link_id & 0xffff;
publ_list = &n->publ_list;
n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
@@ -301,13 +293,16 @@ static void tipc_node_write_unlock(struct tipc_node *n)
if (flags & TIPC_NOTIFY_NODE_UP)
tipc_named_node_up(net, addr);
- if (flags & TIPC_NOTIFY_LINK_UP)
+ if (flags & TIPC_NOTIFY_LINK_UP) {
+ tipc_mon_peer_up(net, addr, bearer_id);
tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
TIPC_NODE_SCOPE, link_id, addr);
-
- if (flags & TIPC_NOTIFY_LINK_DOWN)
+ }
+ if (flags & TIPC_NOTIFY_LINK_DOWN) {
+ tipc_mon_peer_down(net, addr, bearer_id);
tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
link_id, addr);
+ }
}
struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
@@ -378,14 +373,13 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
{
unsigned long tol = tipc_link_tolerance(l);
unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
- unsigned long keepalive_intv = msecs_to_jiffies(intv);
/* Link with lowest tolerance determines timer interval */
- if (keepalive_intv < n->keepalive_intv)
- n->keepalive_intv = keepalive_intv;
+ if (intv < n->keepalive_intv)
+ n->keepalive_intv = intv;
- /* Ensure link's abort limit corresponds to current interval */
- tipc_link_set_abort_limit(l, tol / jiffies_to_msecs(n->keepalive_intv));
+ /* Ensure link's abort limit corresponds to current tolerance */
+ tipc_link_set_abort_limit(l, tol / n->keepalive_intv);
}
static void tipc_node_delete(struct tipc_node *node)
@@ -526,7 +520,7 @@ static void tipc_node_timeout(unsigned long data)
if (rc & TIPC_LINK_DOWN_EVT)
tipc_node_link_down(n, bearer_id, false);
}
- mod_timer(&n->timer, jiffies + n->keepalive_intv);
+ mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv));
}
/**
@@ -692,6 +686,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
struct tipc_link *l = le->link;
struct tipc_media_addr *maddr;
struct sk_buff_head xmitq;
+ int old_bearer_id = bearer_id;
if (!l)
return;
@@ -711,6 +706,8 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
tipc_link_fsm_evt(l, LINK_RESET_EVT);
}
tipc_node_write_unlock(n);
+ if (delete)
+ tipc_mon_remove_peer(n->net, n->addr, old_bearer_id);
tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);
tipc_sk_rcv(n->net, &le->inputq);
}
@@ -735,6 +732,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
bool accept_addr = false;
bool reset = true;
char *if_name;
+ unsigned long intv;
*dupl_addr = false;
*respond = false;
@@ -840,9 +838,11 @@ void tipc_node_check_dest(struct net *net, u32 onode,
le->link = l;
n->link_cnt++;
tipc_node_calculate_timer(n, l);
- if (n->link_cnt == 1)
- if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
+ if (n->link_cnt == 1) {
+ intv = jiffies + msecs_to_jiffies(n->keepalive_intv);
+ if (!mod_timer(&n->timer, intv))
tipc_node_get(n);
+ }
}
memcpy(&le->maddr, maddr, sizeof(*maddr));
exit:
@@ -950,7 +950,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
state = SELF_UP_PEER_UP;
break;
case SELF_LOST_CONTACT_EVT:
- state = SELF_DOWN_PEER_LEAVING;
+ state = SELF_DOWN_PEER_DOWN;
break;
case SELF_ESTABL_CONTACT_EVT:
case PEER_LOST_CONTACT_EVT:
@@ -969,7 +969,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
state = SELF_UP_PEER_UP;
break;
case PEER_LOST_CONTACT_EVT:
- state = SELF_LEAVING_PEER_DOWN;
+ state = SELF_DOWN_PEER_DOWN;
break;
case SELF_LOST_CONTACT_EVT:
case PEER_ESTABL_CONTACT_EVT:
@@ -1928,3 +1928,168 @@ out:
return skb->len;
}
+
+int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attrs[TIPC_NLA_MON_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ int err;
+
+ if (!info->attrs[TIPC_NLA_MON])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_MON_MAX,
+ info->attrs[TIPC_NLA_MON],
+ tipc_nl_monitor_policy);
+ if (err)
+ return err;
+
+ if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) {
+ u32 val;
+
+ val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]);
+ err = tipc_nl_monitor_set_threshold(net, val);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg)
+{
+ struct nlattr *attrs;
+ void *hdr;
+ u32 val;
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ 0, TIPC_NL_MON_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_MON);
+ if (!attrs)
+ goto msg_full;
+
+ val = tipc_nl_monitor_get_threshold(net);
+
+ if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val))
+ goto attr_msg_full;
+
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+
+ return 0;
+
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tipc_nl_msg msg;
+ int err;
+
+ msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ msg.portid = info->snd_portid;
+ msg.seq = info->snd_seq;
+
+ err = __tipc_nl_add_monitor_prop(net, &msg);
+ if (err) {
+ nlmsg_free(msg.skb);
+ return err;
+ }
+
+ return genlmsg_reply(msg.skb, info);
+}
+
+int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ u32 prev_bearer = cb->args[0];
+ struct tipc_nl_msg msg;
+ int err;
+ int i;
+
+ if (prev_bearer == MAX_BEARERS)
+ return 0;
+
+ msg.skb = skb;
+ msg.portid = NETLINK_CB(cb->skb).portid;
+ msg.seq = cb->nlh->nlmsg_seq;
+
+ rtnl_lock();
+ for (i = prev_bearer; i < MAX_BEARERS; i++) {
+ prev_bearer = i;
+ err = __tipc_nl_add_monitor(net, &msg, prev_bearer);
+ if (err)
+ goto out;
+ }
+
+out:
+ rtnl_unlock();
+ cb->args[0] = prev_bearer;
+
+ return skb->len;
+}
+
+int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ u32 prev_node = cb->args[1];
+ u32 bearer_id = cb->args[2];
+ int done = cb->args[0];
+ struct tipc_nl_msg msg;
+ int err;
+
+ if (!prev_node) {
+ struct nlattr **attrs;
+ struct nlattr *mon[TIPC_NLA_MON_MAX + 1];
+
+ err = tipc_nlmsg_parse(cb->nlh, &attrs);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_MON])
+ return -EINVAL;
+
+ err = nla_parse_nested(mon, TIPC_NLA_MON_MAX,
+ attrs[TIPC_NLA_MON],
+ tipc_nl_monitor_policy);
+ if (err)
+ return err;
+
+ if (!mon[TIPC_NLA_MON_REF])
+ return -EINVAL;
+
+ bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]);
+
+ if (bearer_id >= MAX_BEARERS)
+ return -EINVAL;
+ }
+
+ if (done)
+ return 0;
+
+ msg.skb = skb;
+ msg.portid = NETLINK_CB(cb->skb).portid;
+ msg.seq = cb->nlh->nlmsg_seq;
+
+ rtnl_lock();
+ err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node);
+ if (!err)
+ done = 1;
+
+ rtnl_unlock();
+ cb->args[0] = done;
+ cb->args[1] = prev_node;
+ cb->args[2] = bearer_id;
+
+ return skb->len;
+}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 8264b3d97..d69fdfcc0 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -78,4 +78,9 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb);
+int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
+ struct netlink_callback *cb);
#endif
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 272d20a79..215849ce4 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -418,13 +418,12 @@ static struct outqueue_entry *tipc_alloc_entry(void *data, int len)
if (!entry)
return NULL;
- buf = kmalloc(len, GFP_ATOMIC);
+ buf = kmemdup(data, len, GFP_ATOMIC);
if (!buf) {
kfree(entry);
return NULL;
}
- memcpy(buf, data, len);
entry->iov.iov_base = buf;
entry->iov.iov_len = len;
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index c9cf2be36..ae7e14cae 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -63,7 +63,7 @@
*/
struct udp_media_addr {
__be16 proto;
- __be16 udp_port;
+ __be16 port;
union {
struct in_addr ipv4;
struct in6_addr ipv6;
@@ -108,9 +108,9 @@ static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size)
struct udp_media_addr *ua = (struct udp_media_addr *)&a->value;
if (ntohs(ua->proto) == ETH_P_IP)
- snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->udp_port));
+ snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->port));
else if (ntohs(ua->proto) == ETH_P_IPV6)
- snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->udp_port));
+ snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->port));
else
pr_err("Invalid UDP media address\n");
return 0;
@@ -178,8 +178,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
skb->dev = rt->dst.dev;
ttl = ip4_dst_hoplimit(&rt->dst);
udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr,
- dst->ipv4.s_addr, 0, ttl, 0, src->udp_port,
- dst->udp_port, false, true);
+ dst->ipv4.s_addr, 0, ttl, 0, src->port,
+ dst->port, false, true);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct dst_entry *ndst;
@@ -196,8 +196,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
ttl = ip6_dst_hoplimit(ndst);
err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb,
ndst->dev, &src->ipv6,
- &dst->ipv6, 0, ttl, 0, src->udp_port,
- dst->udp_port, false);
+ &dst->ipv6, 0, ttl, 0, src->port,
+ dst->port, false);
#endif
}
return err;
@@ -292,12 +292,12 @@ err:
ip4 = (struct sockaddr_in *)&sa_local;
local->proto = htons(ETH_P_IP);
- local->udp_port = ip4->sin_port;
+ local->port = ip4->sin_port;
local->ipv4.s_addr = ip4->sin_addr.s_addr;
ip4 = (struct sockaddr_in *)&sa_remote;
remote->proto = htons(ETH_P_IP);
- remote->udp_port = ip4->sin_port;
+ remote->port = ip4->sin_port;
remote->ipv4.s_addr = ip4->sin_addr.s_addr;
return 0;
@@ -312,13 +312,13 @@ err:
return -EINVAL;
local->proto = htons(ETH_P_IPV6);
- local->udp_port = ip6->sin6_port;
+ local->port = ip6->sin6_port;
memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
ub->ifindex = ip6->sin6_scope_id;
ip6 = (struct sockaddr_in6 *)&sa_remote;
remote->proto = htons(ETH_P_IPV6);
- remote->udp_port = ip6->sin6_port;
+ remote->port = ip6->sin6_port;
memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
return 0;
#endif
@@ -386,7 +386,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
err = -EAFNOSUPPORT;
goto err;
}
- udp_conf.local_udp_port = local.udp_port;
+ udp_conf.local_udp_port = local.port;
err = udp_sock_create(net, &udp_conf, &ub->ubsock);
if (err)
goto err;
@@ -396,10 +396,13 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
tuncfg.encap_destroy = NULL;
setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg);
- if (enable_mcast(ub, remote))
+ err = enable_mcast(ub, remote);
+ if (err)
goto err;
return 0;
err:
+ if (ub->ubsock)
+ udp_tunnel_sock_release(ub->ubsock);
kfree(ub);
return err;
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e444fa47e..8309687a5 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -769,6 +769,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
lockdep_set_class(&sk->sk_receive_queue.lock,
&af_unix_sk_receive_queue_lock_key);
+ sk->sk_allocation = GFP_KERNEL_ACCOUNT;
sk->sk_write_space = unix_write_space;
sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
sk->sk_destruct = unix_sock_destructor;
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
index 14810abed..8831e7c42 100644
--- a/net/vmw_vsock/Kconfig
+++ b/net/vmw_vsock/Kconfig
@@ -26,3 +26,23 @@ config VMWARE_VMCI_VSOCKETS
To compile this driver as a module, choose M here: the module
will be called vmw_vsock_vmci_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS
+ tristate "virtio transport for Virtual Sockets"
+ depends on VSOCKETS && VIRTIO
+ select VIRTIO_VSOCKETS_COMMON
+ help
+ This module implements a virtio transport for Virtual Sockets.
+
+ Enable this transport if your Virtual Machine host supports Virtual
+ Sockets over virtio.
+
+ To compile this driver as a module, choose M here: the module will be
+ called vmw_vsock_virtio_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS_COMMON
+ tristate
+ help
+ This option is selected by any driver which needs to access
+ the virtio_vsock. The module will be called
+ vmw_vsock_virtio_transport_common.
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index 2ce52d70f..bc27c70e0 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -1,7 +1,13 @@
obj-$(CONFIG_VSOCKETS) += vsock.o
obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o
vsock-y += af_vsock.o vsock_addr.o
vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
vmci_transport_notify_qstate.o
+
+vmw_vsock_virtio_transport-y += virtio_transport.o
+
+vmw_vsock_virtio_transport_common-y += virtio_transport_common.o
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index b96ac918e..8a398b3fb 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -344,6 +344,16 @@ static bool vsock_in_connected_table(struct vsock_sock *vsk)
return ret;
}
+void vsock_remove_sock(struct vsock_sock *vsk)
+{
+ if (vsock_in_bound_table(vsk))
+ vsock_remove_bound(vsk);
+
+ if (vsock_in_connected_table(vsk))
+ vsock_remove_connected(vsk);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_sock);
+
void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
{
int i;
@@ -455,6 +465,8 @@ void vsock_pending_work(struct work_struct *work)
if (vsock_is_pending(sk)) {
vsock_remove_pending(listener, sk);
+
+ listener->sk_ack_backlog--;
} else if (!vsk->rejected) {
/* We are not on the pending list and accept() did not reject
* us, so we must have been accepted by our user process. We
@@ -465,8 +477,6 @@ void vsock_pending_work(struct work_struct *work)
goto out;
}
- listener->sk_ack_backlog--;
-
/* We need to remove ourself from the global connected sockets list so
* incoming packets can't find this socket, and to reduce the reference
* count.
@@ -660,12 +670,6 @@ static void __vsock_release(struct sock *sk)
vsk = vsock_sk(sk);
pending = NULL; /* Compiler warning. */
- if (vsock_in_bound_table(vsk))
- vsock_remove_bound(vsk);
-
- if (vsock_in_connected_table(vsk))
- vsock_remove_connected(vsk);
-
transport->release(vsk);
lock_sock(sk);
@@ -1995,7 +1999,16 @@ void vsock_core_exit(void)
}
EXPORT_SYMBOL_GPL(vsock_core_exit);
+const struct vsock_transport *vsock_core_get_transport(void)
+{
+ /* vsock_register_mutex not taken since only the transport uses this
+ * function and only while registered.
+ */
+ return transport;
+}
+EXPORT_SYMBOL_GPL(vsock_core_get_transport);
+
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
-MODULE_VERSION("1.0.1.0-k");
+MODULE_VERSION("1.0.2.0-k");
MODULE_LICENSE("GPL v2");
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
new file mode 100644
index 000000000..936d7eee6
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport.c
@@ -0,0 +1,620 @@
+/*
+ * virtio transport for vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * Some of the code is take from Gerd Hoffmann <kraxel@redhat.com>'s
+ * early virtio-vsock proof-of-concept bits.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_vsock.h>
+#include <net/sock.h>
+#include <linux/mutex.h>
+#include <net/af_vsock.h>
+
+static struct workqueue_struct *virtio_vsock_workqueue;
+static struct virtio_vsock *the_virtio_vsock;
+static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */
+
+struct virtio_vsock {
+ struct virtio_device *vdev;
+ struct virtqueue *vqs[VSOCK_VQ_MAX];
+
+ /* Virtqueue processing is deferred to a workqueue */
+ struct work_struct tx_work;
+ struct work_struct rx_work;
+ struct work_struct event_work;
+
+ /* The following fields are protected by tx_lock. vqs[VSOCK_VQ_TX]
+ * must be accessed with tx_lock held.
+ */
+ struct mutex tx_lock;
+
+ struct work_struct send_pkt_work;
+ spinlock_t send_pkt_list_lock;
+ struct list_head send_pkt_list;
+
+ atomic_t queued_replies;
+
+ /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
+ * must be accessed with rx_lock held.
+ */
+ struct mutex rx_lock;
+ int rx_buf_nr;
+ int rx_buf_max_nr;
+
+ /* The following fields are protected by event_lock.
+ * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held.
+ */
+ struct mutex event_lock;
+ struct virtio_vsock_event event_list[8];
+
+ u32 guest_cid;
+};
+
+static struct virtio_vsock *virtio_vsock_get(void)
+{
+ return the_virtio_vsock;
+}
+
+static u32 virtio_transport_get_local_cid(void)
+{
+ struct virtio_vsock *vsock = virtio_vsock_get();
+
+ return vsock->guest_cid;
+}
+
+static void
+virtio_transport_send_pkt_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, send_pkt_work);
+ struct virtqueue *vq;
+ bool added = false;
+ bool restart_rx = false;
+
+ mutex_lock(&vsock->tx_lock);
+
+ vq = vsock->vqs[VSOCK_VQ_TX];
+
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ struct scatterlist hdr, buf, *sgs[2];
+ int ret, in_sg = 0, out_sg = 0;
+ bool reply;
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ if (list_empty(&vsock->send_pkt_list)) {
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ break;
+ }
+
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ reply = pkt->reply;
+
+ sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+ sgs[out_sg++] = &hdr;
+ if (pkt->buf) {
+ sg_init_one(&buf, pkt->buf, pkt->len);
+ sgs[out_sg++] = &buf;
+ }
+
+ ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL);
+ /* Usually this means that there is no more space available in
+ * the vq
+ */
+ if (ret < 0) {
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ break;
+ }
+
+ if (reply) {
+ struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+ int val;
+
+ val = atomic_dec_return(&vsock->queued_replies);
+
+ /* Do we now have resources to resume rx processing? */
+ if (val + 1 == virtqueue_get_vring_size(rx_vq))
+ restart_rx = true;
+ }
+
+ added = true;
+ }
+
+ if (added)
+ virtqueue_kick(vq);
+
+ mutex_unlock(&vsock->tx_lock);
+
+ if (restart_rx)
+ queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+}
+
+static int
+virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock *vsock;
+ int len = pkt->len;
+
+ vsock = virtio_vsock_get();
+ if (!vsock) {
+ virtio_transport_free_pkt(pkt);
+ return -ENODEV;
+ }
+
+ if (pkt->reply)
+ atomic_inc(&vsock->queued_replies);
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add_tail(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+ return len;
+}
+
+static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
+{
+ int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+ struct virtio_vsock_pkt *pkt;
+ struct scatterlist hdr, buf, *sgs[2];
+ struct virtqueue *vq;
+ int ret;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ do {
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ break;
+
+ pkt->buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!pkt->buf) {
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+
+ pkt->len = buf_len;
+
+ sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+ sgs[0] = &hdr;
+
+ sg_init_one(&buf, pkt->buf, buf_len);
+ sgs[1] = &buf;
+ ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
+ if (ret) {
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+ vsock->rx_buf_nr++;
+ } while (vq->num_free);
+ if (vsock->rx_buf_nr > vsock->rx_buf_max_nr)
+ vsock->rx_buf_max_nr = vsock->rx_buf_nr;
+ virtqueue_kick(vq);
+}
+
+static void virtio_transport_tx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, tx_work);
+ struct virtqueue *vq;
+ bool added = false;
+
+ vq = vsock->vqs[VSOCK_VQ_TX];
+ mutex_lock(&vsock->tx_lock);
+ do {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ virtqueue_disable_cb(vq);
+ while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) {
+ virtio_transport_free_pkt(pkt);
+ added = true;
+ }
+ } while (!virtqueue_enable_cb(vq));
+ mutex_unlock(&vsock->tx_lock);
+
+ if (added)
+ queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+}
+
+/* Is there space left for replies to rx packets? */
+static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
+{
+ struct virtqueue *vq = vsock->vqs[VSOCK_VQ_RX];
+ int val;
+
+ smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */
+ val = atomic_read(&vsock->queued_replies);
+
+ return val < virtqueue_get_vring_size(vq);
+}
+
+static void virtio_transport_rx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, rx_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ mutex_lock(&vsock->rx_lock);
+
+ do {
+ virtqueue_disable_cb(vq);
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ if (!virtio_transport_more_replies(vsock)) {
+ /* Stop rx until the device processes already
+ * pending replies. Leave rx virtqueue
+ * callbacks disabled.
+ */
+ goto out;
+ }
+
+ pkt = virtqueue_get_buf(vq, &len);
+ if (!pkt) {
+ break;
+ }
+
+ vsock->rx_buf_nr--;
+
+ /* Drop short/long packets */
+ if (unlikely(len < sizeof(pkt->hdr) ||
+ len > sizeof(pkt->hdr) + pkt->len)) {
+ virtio_transport_free_pkt(pkt);
+ continue;
+ }
+
+ pkt->len = len - sizeof(pkt->hdr);
+ virtio_transport_recv_pkt(pkt);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+out:
+ if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
+ virtio_vsock_rx_fill(vsock);
+ mutex_unlock(&vsock->rx_lock);
+}
+
+/* event_lock must be held */
+static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock,
+ struct virtio_vsock_event *event)
+{
+ struct scatterlist sg;
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_EVENT];
+
+ sg_init_one(&sg, event, sizeof(*event));
+
+ return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL);
+}
+
+/* event_lock must be held */
+static void virtio_vsock_event_fill(struct virtio_vsock *vsock)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(vsock->event_list); i++) {
+ struct virtio_vsock_event *event = &vsock->event_list[i];
+
+ virtio_vsock_event_fill_one(vsock, event);
+ }
+
+ virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+}
+
+static void virtio_vsock_reset_sock(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = ECONNRESET;
+ sk->sk_error_report(sk);
+ release_sock(sk);
+}
+
+static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)
+{
+ struct virtio_device *vdev = vsock->vdev;
+ u64 guest_cid;
+
+ vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid),
+ &guest_cid, sizeof(guest_cid));
+ vsock->guest_cid = le64_to_cpu(guest_cid);
+}
+
+/* event_lock must be held */
+static void virtio_vsock_event_handle(struct virtio_vsock *vsock,
+ struct virtio_vsock_event *event)
+{
+ switch (le32_to_cpu(event->id)) {
+ case VIRTIO_VSOCK_EVENT_TRANSPORT_RESET:
+ virtio_vsock_update_guest_cid(vsock);
+ vsock_for_each_connected_socket(virtio_vsock_reset_sock);
+ break;
+ }
+}
+
+static void virtio_transport_event_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, event_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_EVENT];
+
+ mutex_lock(&vsock->event_lock);
+
+ do {
+ struct virtio_vsock_event *event;
+ unsigned int len;
+
+ virtqueue_disable_cb(vq);
+ while ((event = virtqueue_get_buf(vq, &len)) != NULL) {
+ if (len == sizeof(*event))
+ virtio_vsock_event_handle(vsock, event);
+
+ virtio_vsock_event_fill_one(vsock, event);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+ virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+
+ mutex_unlock(&vsock->event_lock);
+}
+
+static void virtio_vsock_event_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (!vsock)
+ return;
+ queue_work(virtio_vsock_workqueue, &vsock->event_work);
+}
+
+static void virtio_vsock_tx_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (!vsock)
+ return;
+ queue_work(virtio_vsock_workqueue, &vsock->tx_work);
+}
+
+static void virtio_vsock_rx_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (!vsock)
+ return;
+ queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+}
+
+static struct virtio_transport virtio_transport = {
+ .transport = {
+ .get_local_cid = virtio_transport_get_local_cid,
+
+ .init = virtio_transport_do_socket_init,
+ .destruct = virtio_transport_destruct,
+ .release = virtio_transport_release,
+ .connect = virtio_transport_connect,
+ .shutdown = virtio_transport_shutdown,
+
+ .dgram_bind = virtio_transport_dgram_bind,
+ .dgram_dequeue = virtio_transport_dgram_dequeue,
+ .dgram_enqueue = virtio_transport_dgram_enqueue,
+ .dgram_allow = virtio_transport_dgram_allow,
+
+ .stream_dequeue = virtio_transport_stream_dequeue,
+ .stream_enqueue = virtio_transport_stream_enqueue,
+ .stream_has_data = virtio_transport_stream_has_data,
+ .stream_has_space = virtio_transport_stream_has_space,
+ .stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
+ .stream_is_active = virtio_transport_stream_is_active,
+ .stream_allow = virtio_transport_stream_allow,
+
+ .notify_poll_in = virtio_transport_notify_poll_in,
+ .notify_poll_out = virtio_transport_notify_poll_out,
+ .notify_recv_init = virtio_transport_notify_recv_init,
+ .notify_recv_pre_block = virtio_transport_notify_recv_pre_block,
+ .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue,
+ .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
+ .notify_send_init = virtio_transport_notify_send_init,
+ .notify_send_pre_block = virtio_transport_notify_send_pre_block,
+ .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
+ .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
+
+ .set_buffer_size = virtio_transport_set_buffer_size,
+ .set_min_buffer_size = virtio_transport_set_min_buffer_size,
+ .set_max_buffer_size = virtio_transport_set_max_buffer_size,
+ .get_buffer_size = virtio_transport_get_buffer_size,
+ .get_min_buffer_size = virtio_transport_get_min_buffer_size,
+ .get_max_buffer_size = virtio_transport_get_max_buffer_size,
+ },
+
+ .send_pkt = virtio_transport_send_pkt,
+};
+
+static int virtio_vsock_probe(struct virtio_device *vdev)
+{
+ vq_callback_t *callbacks[] = {
+ virtio_vsock_rx_done,
+ virtio_vsock_tx_done,
+ virtio_vsock_event_done,
+ };
+ static const char * const names[] = {
+ "rx",
+ "tx",
+ "event",
+ };
+ struct virtio_vsock *vsock = NULL;
+ int ret;
+
+ ret = mutex_lock_interruptible(&the_virtio_vsock_mutex);
+ if (ret)
+ return ret;
+
+ /* Only one virtio-vsock device per guest is supported */
+ if (the_virtio_vsock) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ vsock = kzalloc(sizeof(*vsock), GFP_KERNEL);
+ if (!vsock) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ vsock->vdev = vdev;
+
+ ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
+ vsock->vqs, callbacks, names);
+ if (ret < 0)
+ goto out;
+
+ virtio_vsock_update_guest_cid(vsock);
+
+ ret = vsock_core_init(&virtio_transport.transport);
+ if (ret < 0)
+ goto out_vqs;
+
+ vsock->rx_buf_nr = 0;
+ vsock->rx_buf_max_nr = 0;
+ atomic_set(&vsock->queued_replies, 0);
+
+ vdev->priv = vsock;
+ the_virtio_vsock = vsock;
+ mutex_init(&vsock->tx_lock);
+ mutex_init(&vsock->rx_lock);
+ mutex_init(&vsock->event_lock);
+ spin_lock_init(&vsock->send_pkt_list_lock);
+ INIT_LIST_HEAD(&vsock->send_pkt_list);
+ INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
+ INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
+ INIT_WORK(&vsock->event_work, virtio_transport_event_work);
+ INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
+
+ mutex_lock(&vsock->rx_lock);
+ virtio_vsock_rx_fill(vsock);
+ mutex_unlock(&vsock->rx_lock);
+
+ mutex_lock(&vsock->event_lock);
+ virtio_vsock_event_fill(vsock);
+ mutex_unlock(&vsock->event_lock);
+
+ mutex_unlock(&the_virtio_vsock_mutex);
+ return 0;
+
+out_vqs:
+ vsock->vdev->config->del_vqs(vsock->vdev);
+out:
+ kfree(vsock);
+ mutex_unlock(&the_virtio_vsock_mutex);
+ return ret;
+}
+
+static void virtio_vsock_remove(struct virtio_device *vdev)
+{
+ struct virtio_vsock *vsock = vdev->priv;
+ struct virtio_vsock_pkt *pkt;
+
+ flush_work(&vsock->rx_work);
+ flush_work(&vsock->tx_work);
+ flush_work(&vsock->event_work);
+ flush_work(&vsock->send_pkt_work);
+
+ vdev->config->reset(vdev);
+
+ mutex_lock(&vsock->rx_lock);
+ while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX])))
+ virtio_transport_free_pkt(pkt);
+ mutex_unlock(&vsock->rx_lock);
+
+ mutex_lock(&vsock->tx_lock);
+ while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX])))
+ virtio_transport_free_pkt(pkt);
+ mutex_unlock(&vsock->tx_lock);
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ while (!list_empty(&vsock->send_pkt_list)) {
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ mutex_lock(&the_virtio_vsock_mutex);
+ the_virtio_vsock = NULL;
+ vsock_core_exit();
+ mutex_unlock(&the_virtio_vsock_mutex);
+
+ vdev->config->del_vqs(vdev);
+
+ kfree(vsock);
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_VSOCK, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static unsigned int features[] = {
+};
+
+static struct virtio_driver virtio_vsock_driver = {
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = virtio_vsock_probe,
+ .remove = virtio_vsock_remove,
+};
+
+static int __init virtio_vsock_init(void)
+{
+ int ret;
+
+ virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0);
+ if (!virtio_vsock_workqueue)
+ return -ENOMEM;
+ ret = register_virtio_driver(&virtio_vsock_driver);
+ if (ret)
+ destroy_workqueue(virtio_vsock_workqueue);
+ return ret;
+}
+
+static void __exit virtio_vsock_exit(void)
+{
+ unregister_virtio_driver(&virtio_vsock_driver);
+ destroy_workqueue(virtio_vsock_workqueue);
+}
+
+module_init(virtio_vsock_init);
+module_exit(virtio_vsock_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("virtio transport for vsock");
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
new file mode 100644
index 000000000..a53b3a16b
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -0,0 +1,992 @@
+/*
+ * common code for virtio vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_vsock.h>
+
+#include <net/sock.h>
+#include <net/af_vsock.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/vsock_virtio_transport_common.h>
+
+/* How long to wait for graceful shutdown of a connection */
+#define VSOCK_CLOSE_TIMEOUT (8 * HZ)
+
+static const struct virtio_transport *virtio_transport_get_ops(void)
+{
+ const struct vsock_transport *t = vsock_core_get_transport();
+
+ return container_of(t, struct virtio_transport, transport);
+}
+
+struct virtio_vsock_pkt *
+virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
+ size_t len,
+ u32 src_cid,
+ u32 src_port,
+ u32 dst_cid,
+ u32 dst_port)
+{
+ struct virtio_vsock_pkt *pkt;
+ int err;
+
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ return NULL;
+
+ pkt->hdr.type = cpu_to_le16(info->type);
+ pkt->hdr.op = cpu_to_le16(info->op);
+ pkt->hdr.src_cid = cpu_to_le64(src_cid);
+ pkt->hdr.dst_cid = cpu_to_le64(dst_cid);
+ pkt->hdr.src_port = cpu_to_le32(src_port);
+ pkt->hdr.dst_port = cpu_to_le32(dst_port);
+ pkt->hdr.flags = cpu_to_le32(info->flags);
+ pkt->len = len;
+ pkt->hdr.len = cpu_to_le32(len);
+ pkt->reply = info->reply;
+
+ if (info->msg && len > 0) {
+ pkt->buf = kmalloc(len, GFP_KERNEL);
+ if (!pkt->buf)
+ goto out_pkt;
+ err = memcpy_from_msg(pkt->buf, info->msg, len);
+ if (err)
+ goto out;
+ }
+
+ trace_virtio_transport_alloc_pkt(src_cid, src_port,
+ dst_cid, dst_port,
+ len,
+ info->type,
+ info->op,
+ info->flags);
+
+ return pkt;
+
+out:
+ kfree(pkt->buf);
+out_pkt:
+ kfree(pkt);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt);
+
+static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt_info *info)
+{
+ u32 src_cid, src_port, dst_cid, dst_port;
+ struct virtio_vsock_sock *vvs;
+ struct virtio_vsock_pkt *pkt;
+ u32 pkt_len = info->pkt_len;
+
+ src_cid = vm_sockets_get_local_cid();
+ src_port = vsk->local_addr.svm_port;
+ if (!info->remote_cid) {
+ dst_cid = vsk->remote_addr.svm_cid;
+ dst_port = vsk->remote_addr.svm_port;
+ } else {
+ dst_cid = info->remote_cid;
+ dst_port = info->remote_port;
+ }
+
+ vvs = vsk->trans;
+
+ /* we can send less than pkt_len bytes */
+ if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
+ pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+
+ /* virtio_transport_get_credit might return less than pkt_len credit */
+ pkt_len = virtio_transport_get_credit(vvs, pkt_len);
+
+ /* Do not send zero length OP_RW pkt */
+ if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
+ return pkt_len;
+
+ pkt = virtio_transport_alloc_pkt(info, pkt_len,
+ src_cid, src_port,
+ dst_cid, dst_port);
+ if (!pkt) {
+ virtio_transport_put_credit(vvs, pkt_len);
+ return -ENOMEM;
+ }
+
+ virtio_transport_inc_tx_pkt(vvs, pkt);
+
+ return virtio_transport_get_ops()->send_pkt(pkt);
+}
+
+static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
+ struct virtio_vsock_pkt *pkt)
+{
+ vvs->rx_bytes += pkt->len;
+}
+
+static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
+ struct virtio_vsock_pkt *pkt)
+{
+ vvs->rx_bytes -= pkt->len;
+ vvs->fwd_cnt += pkt->len;
+}
+
+void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
+{
+ spin_lock_bh(&vvs->tx_lock);
+ pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
+ pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
+ spin_unlock_bh(&vvs->tx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
+
+u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+ u32 ret;
+
+ spin_lock_bh(&vvs->tx_lock);
+ ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+ if (ret > credit)
+ ret = credit;
+ vvs->tx_cnt += ret;
+ spin_unlock_bh(&vvs->tx_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_credit);
+
+void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+ spin_lock_bh(&vvs->tx_lock);
+ vvs->tx_cnt -= credit;
+ spin_unlock_bh(&vvs->tx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_put_credit);
+
+static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
+ int type,
+ struct virtio_vsock_hdr *hdr)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
+ .type = type,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+static ssize_t
+virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ struct virtio_vsock_pkt *pkt;
+ size_t bytes, total = 0;
+ int err = -EFAULT;
+
+ spin_lock_bh(&vvs->rx_lock);
+ while (total < len && !list_empty(&vvs->rx_queue)) {
+ pkt = list_first_entry(&vvs->rx_queue,
+ struct virtio_vsock_pkt, list);
+
+ bytes = len - total;
+ if (bytes > pkt->len - pkt->off)
+ bytes = pkt->len - pkt->off;
+
+ /* sk_lock is held by caller so no one else can dequeue.
+ * Unlock rx_lock since memcpy_to_msg() may sleep.
+ */
+ spin_unlock_bh(&vvs->rx_lock);
+
+ err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
+ if (err)
+ goto out;
+
+ spin_lock_bh(&vvs->rx_lock);
+
+ total += bytes;
+ pkt->off += bytes;
+ if (pkt->off == pkt->len) {
+ virtio_transport_dec_rx_pkt(vvs, pkt);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ }
+ spin_unlock_bh(&vvs->rx_lock);
+
+ /* Send a credit pkt to peer */
+ virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
+ NULL);
+
+ return total;
+
+out:
+ if (total)
+ err = total;
+ return err;
+}
+
+ssize_t
+virtio_transport_stream_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len, int flags)
+{
+ if (flags & MSG_PEEK)
+ return -EOPNOTSUPP;
+
+ return virtio_transport_stream_do_dequeue(vsk, msg, len);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue);
+
+int
+virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len, int flags)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue);
+
+s64 virtio_transport_stream_has_data(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ s64 bytes;
+
+ spin_lock_bh(&vvs->rx_lock);
+ bytes = vvs->rx_bytes;
+ spin_unlock_bh(&vvs->rx_lock);
+
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data);
+
+static s64 virtio_transport_has_space(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ s64 bytes;
+
+ bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+ if (bytes < 0)
+ bytes = 0;
+
+ return bytes;
+}
+
+s64 virtio_transport_stream_has_space(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ s64 bytes;
+
+ spin_lock_bh(&vvs->tx_lock);
+ bytes = virtio_transport_has_space(vsk);
+ spin_unlock_bh(&vvs->tx_lock);
+
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space);
+
+int virtio_transport_do_socket_init(struct vsock_sock *vsk,
+ struct vsock_sock *psk)
+{
+ struct virtio_vsock_sock *vvs;
+
+ vvs = kzalloc(sizeof(*vvs), GFP_KERNEL);
+ if (!vvs)
+ return -ENOMEM;
+
+ vsk->trans = vvs;
+ vvs->vsk = vsk;
+ if (psk) {
+ struct virtio_vsock_sock *ptrans = psk->trans;
+
+ vvs->buf_size = ptrans->buf_size;
+ vvs->buf_size_min = ptrans->buf_size_min;
+ vvs->buf_size_max = ptrans->buf_size_max;
+ vvs->peer_buf_alloc = ptrans->peer_buf_alloc;
+ } else {
+ vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE;
+ vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE;
+ vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE;
+ }
+
+ vvs->buf_alloc = vvs->buf_size;
+
+ spin_lock_init(&vvs->rx_lock);
+ spin_lock_init(&vvs->tx_lock);
+ INIT_LIST_HEAD(&vvs->rx_queue);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init);
+
+u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size);
+
+u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size_min;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size);
+
+u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size_max;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size);
+
+void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+ if (val < vvs->buf_size_min)
+ vvs->buf_size_min = val;
+ if (val > vvs->buf_size_max)
+ vvs->buf_size_max = val;
+ vvs->buf_size = val;
+ vvs->buf_alloc = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size);
+
+void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+ if (val > vvs->buf_size)
+ vvs->buf_size = val;
+ vvs->buf_size_min = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size);
+
+void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+ if (val < vvs->buf_size)
+ vvs->buf_size = val;
+ vvs->buf_size_max = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size);
+
+int
+virtio_transport_notify_poll_in(struct vsock_sock *vsk,
+ size_t target,
+ bool *data_ready_now)
+{
+ if (vsock_stream_has_data(vsk))
+ *data_ready_now = true;
+ else
+ *data_ready_now = false;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in);
+
+int
+virtio_transport_notify_poll_out(struct vsock_sock *vsk,
+ size_t target,
+ bool *space_avail_now)
+{
+ s64 free_space;
+
+ free_space = vsock_stream_has_space(vsk);
+ if (free_space > 0)
+ *space_avail_now = true;
+ else if (free_space == 0)
+ *space_avail_now = false;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out);
+
+int virtio_transport_notify_recv_init(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init);
+
+int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block);
+
+int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue);
+
+int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk,
+ size_t target, ssize_t copied, bool data_read,
+ struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue);
+
+int virtio_transport_notify_send_init(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init);
+
+int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block);
+
+int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue);
+
+int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk,
+ ssize_t written, struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue);
+
+u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat);
+
+bool virtio_transport_stream_is_active(struct vsock_sock *vsk)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active);
+
+bool virtio_transport_stream_allow(u32 cid, u32 port)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_allow);
+
+int virtio_transport_dgram_bind(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind);
+
+bool virtio_transport_dgram_allow(u32 cid, u32 port)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow);
+
+int virtio_transport_connect(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_REQUEST,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_connect);
+
+int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_SHUTDOWN,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .flags = (mode & RCV_SHUTDOWN ?
+ VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
+ (mode & SEND_SHUTDOWN ?
+ VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_shutdown);
+
+int
+virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
+ struct sockaddr_vm *remote_addr,
+ struct msghdr *msg,
+ size_t dgram_len)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue);
+
+ssize_t
+virtio_transport_stream_enqueue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RW,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .msg = msg,
+ .pkt_len = len,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue);
+
+void virtio_transport_destruct(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ kfree(vvs);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_destruct);
+
+static int virtio_transport_reset(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RST,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .reply = !!pkt,
+ };
+
+ /* Send RST only if the original pkt is not a RST pkt */
+ if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+ return 0;
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+/* Normally packets are associated with a socket. There may be no socket if an
+ * attempt was made to connect to a socket that does not exist.
+ */
+static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RST,
+ .type = le16_to_cpu(pkt->hdr.type),
+ .reply = true,
+ };
+
+ /* Send RST only if the original pkt is not a RST pkt */
+ if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+ return 0;
+
+ pkt = virtio_transport_alloc_pkt(&info, 0,
+ le32_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port),
+ le32_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+ if (!pkt)
+ return -ENOMEM;
+
+ return virtio_transport_get_ops()->send_pkt(pkt);
+}
+
+static void virtio_transport_wait_close(struct sock *sk, long timeout)
+{
+ if (timeout) {
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout,
+ sock_flag(sk, SOCK_DONE)))
+ break;
+ } while (!signal_pending(current) && timeout);
+
+ finish_wait(sk_sleep(sk), &wait);
+ }
+}
+
+static void virtio_transport_do_close(struct vsock_sock *vsk,
+ bool cancel_timeout)
+{
+ struct sock *sk = sk_vsock(vsk);
+
+ sock_set_flag(sk, SOCK_DONE);
+ vsk->peer_shutdown = SHUTDOWN_MASK;
+ if (vsock_stream_has_data(vsk) <= 0)
+ sk->sk_state = SS_DISCONNECTING;
+ sk->sk_state_change(sk);
+
+ if (vsk->close_work_scheduled &&
+ (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) {
+ vsk->close_work_scheduled = false;
+
+ vsock_remove_sock(vsk);
+
+ /* Release refcnt obtained when we scheduled the timeout */
+ sock_put(sk);
+ }
+}
+
+static void virtio_transport_close_timeout(struct work_struct *work)
+{
+ struct vsock_sock *vsk =
+ container_of(work, struct vsock_sock, close_work.work);
+ struct sock *sk = sk_vsock(vsk);
+
+ sock_hold(sk);
+ lock_sock(sk);
+
+ if (!sock_flag(sk, SOCK_DONE)) {
+ (void)virtio_transport_reset(vsk, NULL);
+
+ virtio_transport_do_close(vsk, false);
+ }
+
+ vsk->close_work_scheduled = false;
+
+ release_sock(sk);
+ sock_put(sk);
+}
+
+/* User context, vsk->sk is locked */
+static bool virtio_transport_close(struct vsock_sock *vsk)
+{
+ struct sock *sk = &vsk->sk;
+
+ if (!(sk->sk_state == SS_CONNECTED ||
+ sk->sk_state == SS_DISCONNECTING))
+ return true;
+
+ /* Already received SHUTDOWN from peer, reply with RST */
+ if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) {
+ (void)virtio_transport_reset(vsk, NULL);
+ return true;
+ }
+
+ if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
+ (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK);
+
+ if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
+ virtio_transport_wait_close(sk, sk->sk_lingertime);
+
+ if (sock_flag(sk, SOCK_DONE)) {
+ return true;
+ }
+
+ sock_hold(sk);
+ INIT_DELAYED_WORK(&vsk->close_work,
+ virtio_transport_close_timeout);
+ vsk->close_work_scheduled = true;
+ schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT);
+ return false;
+}
+
+void virtio_transport_release(struct vsock_sock *vsk)
+{
+ struct sock *sk = &vsk->sk;
+ bool remove_sock = true;
+
+ lock_sock(sk);
+ if (sk->sk_type == SOCK_STREAM)
+ remove_sock = virtio_transport_close(vsk);
+ release_sock(sk);
+
+ if (remove_sock)
+ vsock_remove_sock(vsk);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_release);
+
+static int
+virtio_transport_recv_connecting(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ int err;
+ int skerr;
+
+ switch (le16_to_cpu(pkt->hdr.op)) {
+ case VIRTIO_VSOCK_OP_RESPONSE:
+ sk->sk_state = SS_CONNECTED;
+ sk->sk_socket->state = SS_CONNECTED;
+ vsock_insert_connected(vsk);
+ sk->sk_state_change(sk);
+ break;
+ case VIRTIO_VSOCK_OP_INVALID:
+ break;
+ case VIRTIO_VSOCK_OP_RST:
+ skerr = ECONNRESET;
+ err = 0;
+ goto destroy;
+ default:
+ skerr = EPROTO;
+ err = -EINVAL;
+ goto destroy;
+ }
+ return 0;
+
+destroy:
+ virtio_transport_reset(vsk, pkt);
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = skerr;
+ sk->sk_error_report(sk);
+ return err;
+}
+
+static int
+virtio_transport_recv_connected(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ int err = 0;
+
+ switch (le16_to_cpu(pkt->hdr.op)) {
+ case VIRTIO_VSOCK_OP_RW:
+ pkt->len = le32_to_cpu(pkt->hdr.len);
+ pkt->off = 0;
+
+ spin_lock_bh(&vvs->rx_lock);
+ virtio_transport_inc_rx_pkt(vvs, pkt);
+ list_add_tail(&pkt->list, &vvs->rx_queue);
+ spin_unlock_bh(&vvs->rx_lock);
+
+ sk->sk_data_ready(sk);
+ return err;
+ case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
+ sk->sk_write_space(sk);
+ break;
+ case VIRTIO_VSOCK_OP_SHUTDOWN:
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV)
+ vsk->peer_shutdown |= RCV_SHUTDOWN;
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
+ vsk->peer_shutdown |= SEND_SHUTDOWN;
+ if (vsk->peer_shutdown == SHUTDOWN_MASK &&
+ vsock_stream_has_data(vsk) <= 0)
+ sk->sk_state = SS_DISCONNECTING;
+ if (le32_to_cpu(pkt->hdr.flags))
+ sk->sk_state_change(sk);
+ break;
+ case VIRTIO_VSOCK_OP_RST:
+ virtio_transport_do_close(vsk, true);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ virtio_transport_free_pkt(pkt);
+ return err;
+}
+
+static void
+virtio_transport_recv_disconnecting(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+ virtio_transport_do_close(vsk, true);
+}
+
+static int
+virtio_transport_send_response(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RESPONSE,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .remote_cid = le32_to_cpu(pkt->hdr.src_cid),
+ .remote_port = le32_to_cpu(pkt->hdr.src_port),
+ .reply = true,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+/* Handle server socket */
+static int
+virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct vsock_sock *vchild;
+ struct sock *child;
+
+ if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) {
+ virtio_transport_reset(vsk, pkt);
+ return -EINVAL;
+ }
+
+ if (sk_acceptq_is_full(sk)) {
+ virtio_transport_reset(vsk, pkt);
+ return -ENOMEM;
+ }
+
+ child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
+ sk->sk_type, 0);
+ if (!child) {
+ virtio_transport_reset(vsk, pkt);
+ return -ENOMEM;
+ }
+
+ sk->sk_ack_backlog++;
+
+ lock_sock_nested(child, SINGLE_DEPTH_NESTING);
+
+ child->sk_state = SS_CONNECTED;
+
+ vchild = vsock_sk(child);
+ vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port));
+ vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+
+ vsock_insert_connected(vchild);
+ vsock_enqueue_accept(sk, child);
+ virtio_transport_send_response(vchild, pkt);
+
+ release_sock(child);
+
+ sk->sk_data_ready(sk);
+ return 0;
+}
+
+static bool virtio_transport_space_update(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ bool space_available;
+
+ /* buf_alloc and fwd_cnt is always included in the hdr */
+ spin_lock_bh(&vvs->tx_lock);
+ vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
+ vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
+ space_available = virtio_transport_has_space(vsk);
+ spin_unlock_bh(&vvs->tx_lock);
+ return space_available;
+}
+
+/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
+ * lock.
+ */
+void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct sockaddr_vm src, dst;
+ struct vsock_sock *vsk;
+ struct sock *sk;
+ bool space_available;
+
+ vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+ vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port));
+
+ trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
+ dst.svm_cid, dst.svm_port,
+ le32_to_cpu(pkt->hdr.len),
+ le16_to_cpu(pkt->hdr.type),
+ le16_to_cpu(pkt->hdr.op),
+ le32_to_cpu(pkt->hdr.flags),
+ le32_to_cpu(pkt->hdr.buf_alloc),
+ le32_to_cpu(pkt->hdr.fwd_cnt));
+
+ if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
+ (void)virtio_transport_reset_no_sock(pkt);
+ goto free_pkt;
+ }
+
+ /* The socket must be in connected or bound table
+ * otherwise send reset back
+ */
+ sk = vsock_find_connected_socket(&src, &dst);
+ if (!sk) {
+ sk = vsock_find_bound_socket(&dst);
+ if (!sk) {
+ (void)virtio_transport_reset_no_sock(pkt);
+ goto free_pkt;
+ }
+ }
+
+ vsk = vsock_sk(sk);
+
+ space_available = virtio_transport_space_update(sk, pkt);
+
+ lock_sock(sk);
+
+ /* Update CID in case it has changed after a transport reset event */
+ vsk->local_addr.svm_cid = dst.svm_cid;
+
+ if (space_available)
+ sk->sk_write_space(sk);
+
+ switch (sk->sk_state) {
+ case VSOCK_SS_LISTEN:
+ virtio_transport_recv_listen(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ case SS_CONNECTING:
+ virtio_transport_recv_connecting(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ case SS_CONNECTED:
+ virtio_transport_recv_connected(sk, pkt);
+ break;
+ case SS_DISCONNECTING:
+ virtio_transport_recv_disconnecting(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ default:
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+ release_sock(sk);
+
+ /* Release refcnt obtained when we fetched this socket out of the
+ * bound or connected list.
+ */
+ sock_put(sk);
+ return;
+
+free_pkt:
+ virtio_transport_free_pkt(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
+
+void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
+{
+ kfree(pkt->buf);
+ kfree(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 4120b7a53..4be4fbbc0 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1644,6 +1644,8 @@ static void vmci_transport_destruct(struct vsock_sock *vsk)
static void vmci_transport_release(struct vsock_sock *vsk)
{
+ vsock_remove_sock(vsk);
+
if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) {
vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle);
vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE;
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index da49c0b1f..0f506220a 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -513,6 +513,7 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy,
r = cfg80211_get_chans_dfs_available(wiphy,
chandef->center_freq2,
width);
+ break;
default:
WARN_ON(chandef->center_freq2);
break;
@@ -715,7 +716,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
ASSERT_RTNL();
- if (!config_enabled(CONFIG_CFG80211_REG_RELAX_NO_IR) ||
+ if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) ||
!(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR))
return false;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index ecca3896b..7645e9736 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -220,7 +220,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
if (WARN_ON(!rdev->scan_req->notified))
- rdev->scan_req->aborted = true;
+ rdev->scan_req->info.aborted = true;
___cfg80211_scan_done(rdev, false);
}
}
@@ -748,6 +748,36 @@ int wiphy_register(struct wiphy *wiphy)
nl80211_send_reg_change_event(&request);
}
+ /* Check that nobody globally advertises any capabilities they do not
+ * advertise on all possible interface types.
+ */
+ if (wiphy->extended_capabilities_len &&
+ wiphy->num_iftype_ext_capab &&
+ wiphy->iftype_ext_capab) {
+ u8 supported_on_all, j;
+ const struct wiphy_iftype_ext_capab *capab;
+
+ capab = wiphy->iftype_ext_capab;
+ for (j = 0; j < wiphy->extended_capabilities_len; j++) {
+ if (capab[0].extended_capabilities_len > j)
+ supported_on_all =
+ capab[0].extended_capabilities[j];
+ else
+ supported_on_all = 0x00;
+ for (i = 1; i < wiphy->num_iftype_ext_capab; i++) {
+ if (j >= capab[i].extended_capabilities_len) {
+ supported_on_all = 0x00;
+ break;
+ }
+ supported_on_all &=
+ capab[i].extended_capabilities[j];
+ }
+ if (WARN_ON(wiphy->extended_capabilities[j] &
+ ~supported_on_all))
+ break;
+ }
+ }
+
rdev->wiphy.registered = true;
rtnl_unlock();
@@ -1057,7 +1087,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
cfg80211_update_iface_num(rdev, wdev->iftype, -1);
if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
if (WARN_ON(!rdev->scan_req->notified))
- rdev->scan_req->aborted = true;
+ rdev->scan_req->info.aborted = true;
___cfg80211_scan_done(rdev, false);
}
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 025b7a5d5..eee914439 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -141,6 +141,18 @@ struct cfg80211_internal_bss {
unsigned long refcount;
atomic_t hold;
+ /* time at the start of the reception of the first octet of the
+ * timestamp field of the last beacon/probe received for this BSS.
+ * The time is the TSF of the BSS specified by %parent_bssid.
+ */
+ u64 parent_tsf;
+
+ /* the BSS according to which %parent_tsf is set. This is set to
+ * the BSS that the interface that requested the scan was connected to
+ * when the beacon/probe was received.
+ */
+ u8 parent_bssid[ETH_ALEN] __aligned(2);
+
/* must be last because of priv member */
struct cfg80211_bss pub;
};
@@ -214,7 +226,7 @@ struct cfg80211_event {
size_t req_ie_len;
size_t resp_ie_len;
struct cfg80211_bss *bss;
- u16 status;
+ int status; /* -1 = failed; 0..65535 = status code */
} cr;
struct {
const u8 *req_ie;
@@ -374,7 +386,7 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
const u8 *resp_ie, size_t resp_ie_len,
- u16 status, bool wextev,
+ int status, bool wextev,
struct cfg80211_bss *bss);
void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
size_t ie_len, u16 reason, bool from_ap);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7d38dd697..4809f4d2c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -167,6 +167,7 @@ __cfg80211_rdev_from_attrs(struct net *netns, struct nlattr **attrs)
if (attrs[NL80211_ATTR_IFINDEX]) {
int ifindex = nla_get_u32(attrs[NL80211_ATTR_IFINDEX]);
+
netdev = __dev_get_by_index(netns, ifindex);
if (netdev) {
if (netdev->ieee80211_ptr)
@@ -404,6 +405,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
[NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED },
[NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 },
+ [NL80211_ATTR_MU_MIMO_GROUP_DATA] = {
+ .len = VHT_MUMIMO_GROUPS_DATA_LEN
+ },
+ [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
};
/* policy for the key attributes */
@@ -731,6 +736,7 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k)
if (tb[NL80211_KEY_DEFAULT_TYPES]) {
struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
+
err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1,
tb[NL80211_KEY_DEFAULT_TYPES],
nl80211_key_default_policy);
@@ -1264,7 +1270,7 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg,
struct nl80211_dump_wiphy_state {
s64 filter_wiphy;
long start;
- long split_start, band_start, chan_start;
+ long split_start, band_start, chan_start, capa_start;
bool split;
};
@@ -1382,6 +1388,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
rdev->ops->get_antenna) {
u32 tx_ant = 0, rx_ant = 0;
int res;
+
res = rdev_get_antenna(rdev, &tx_ant, &rx_ant);
if (!res) {
if (nla_put_u32(msg,
@@ -1761,6 +1768,47 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
nla_nest_end(msg, nested);
}
+ state->split_start++;
+ break;
+ case 13:
+ if (rdev->wiphy.num_iftype_ext_capab &&
+ rdev->wiphy.iftype_ext_capab) {
+ struct nlattr *nested_ext_capab, *nested;
+
+ nested = nla_nest_start(msg,
+ NL80211_ATTR_IFTYPE_EXT_CAPA);
+ if (!nested)
+ goto nla_put_failure;
+
+ for (i = state->capa_start;
+ i < rdev->wiphy.num_iftype_ext_capab; i++) {
+ const struct wiphy_iftype_ext_capab *capab;
+
+ capab = &rdev->wiphy.iftype_ext_capab[i];
+
+ nested_ext_capab = nla_nest_start(msg, i);
+ if (!nested_ext_capab ||
+ nla_put_u32(msg, NL80211_ATTR_IFTYPE,
+ capab->iftype) ||
+ nla_put(msg, NL80211_ATTR_EXT_CAPA,
+ capab->extended_capabilities_len,
+ capab->extended_capabilities) ||
+ nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK,
+ capab->extended_capabilities_len,
+ capab->extended_capabilities_mask))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nested_ext_capab);
+ if (state->split)
+ break;
+ }
+ nla_nest_end(msg, nested);
+ if (i < rdev->wiphy.num_iftype_ext_capab) {
+ state->capa_start = i + 1;
+ break;
+ }
+ }
+
/* done */
state->split_start = 0;
break;
@@ -2116,7 +2164,6 @@ static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info)
return rdev_set_wds_peer(rdev, dev, bssid);
}
-
static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev;
@@ -2251,6 +2298,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] &&
info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]) {
u32 tx_ant, rx_ant;
+
if ((!rdev->wiphy.available_antennas_tx &&
!rdev->wiphy.available_antennas_rx) ||
!rdev->ops->set_antenna)
@@ -2651,6 +2699,38 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
change = true;
}
+ if (info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]) {
+ const u8 *mumimo_groups;
+ u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER;
+
+ if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag))
+ return -EOPNOTSUPP;
+
+ mumimo_groups =
+ nla_data(info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]);
+
+ /* bits 0 and 63 are reserved and must be zero */
+ if ((mumimo_groups[0] & BIT(7)) ||
+ (mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN - 1] & BIT(0)))
+ return -EINVAL;
+
+ memcpy(params.vht_mumimo_groups, mumimo_groups,
+ VHT_MUMIMO_GROUPS_DATA_LEN);
+ change = true;
+ }
+
+ if (info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]) {
+ u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER;
+
+ if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag))
+ return -EOPNOTSUPP;
+
+ nla_memcpy(params.macaddr,
+ info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR],
+ ETH_ALEN);
+ change = true;
+ }
+
if (flags && (*flags & MONITOR_FLAG_ACTIVE) &&
!(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR))
return -EOPNOTSUPP;
@@ -2919,6 +2999,7 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
pairwise = !!mac_addr;
if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
+
if (kt >= NUM_NL80211_KEYTYPES)
return -EINVAL;
if (kt != NL80211_KEYTYPE_GROUP &&
@@ -3962,7 +4043,6 @@ static int nl80211_dump_station(struct sk_buff *skb,
sta_idx++;
}
-
out:
cb->args[2] = sta_idx;
err = skb->len;
@@ -4366,6 +4446,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]);
if (params.plink_state >= NUM_NL80211_PLINK_STATES)
return -EINVAL;
+ if (info->attrs[NL80211_ATTR_MESH_PEER_AID]) {
+ params.peer_aid = nla_get_u16(
+ info->attrs[NL80211_ATTR_MESH_PEER_AID]);
+ if (params.peer_aid > IEEE80211_MAX_AID)
+ return -EINVAL;
+ }
params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE;
}
@@ -4763,7 +4849,6 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
path_idx++;
}
-
out:
cb->args[2] = path_idx;
err = skb->len;
@@ -5053,7 +5138,6 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
enum nl80211_user_reg_hint_type user_reg_hint_type;
u32 owner_nlportid;
-
/*
* You should only get this when cfg80211 hasn't yet initialized
* completely when built-in to the kernel right between the time
@@ -5245,24 +5329,68 @@ static const struct nla_policy
[NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG },
};
+static int nl80211_check_bool(const struct nlattr *nla, u8 min, u8 max, bool *out)
+{
+ u8 val = nla_get_u8(nla);
+ if (val < min || val > max)
+ return -EINVAL;
+ *out = val;
+ return 0;
+}
+
+static int nl80211_check_u8(const struct nlattr *nla, u8 min, u8 max, u8 *out)
+{
+ u8 val = nla_get_u8(nla);
+ if (val < min || val > max)
+ return -EINVAL;
+ *out = val;
+ return 0;
+}
+
+static int nl80211_check_u16(const struct nlattr *nla, u16 min, u16 max, u16 *out)
+{
+ u16 val = nla_get_u16(nla);
+ if (val < min || val > max)
+ return -EINVAL;
+ *out = val;
+ return 0;
+}
+
+static int nl80211_check_u32(const struct nlattr *nla, u32 min, u32 max, u32 *out)
+{
+ u32 val = nla_get_u32(nla);
+ if (val < min || val > max)
+ return -EINVAL;
+ *out = val;
+ return 0;
+}
+
+static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *out)
+{
+ s32 val = nla_get_s32(nla);
+ if (val < min || val > max)
+ return -EINVAL;
+ *out = val;
+ return 0;
+}
+
static int nl80211_parse_mesh_config(struct genl_info *info,
struct mesh_config *cfg,
u32 *mask_out)
{
struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1];
u32 mask = 0;
+ u16 ht_opmode;
#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, min, max, mask, attr, fn) \
do { \
if (tb[attr]) { \
- if (fn(tb[attr]) < min || fn(tb[attr]) > max) \
+ if (fn(tb[attr], min, max, &cfg->param)) \
return -EINVAL; \
- cfg->param = fn(tb[attr]); \
mask |= (1 << (attr - 1)); \
} \
} while (0)
-
if (!info->attrs[NL80211_ATTR_MESH_CONFIG])
return -EINVAL;
if (nla_parse_nested(tb, NL80211_MESHCONF_ATTR_MAX,
@@ -5277,99 +5405,126 @@ do { \
/* Fill in the params struct */
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, 1, 255,
mask, NL80211_MESHCONF_RETRY_TIMEOUT,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, 1, 255,
mask, NL80211_MESHCONF_CONFIRM_TIMEOUT,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, 1, 255,
mask, NL80211_MESHCONF_HOLDING_TIMEOUT,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, 0, 255,
mask, NL80211_MESHCONF_MAX_PEER_LINKS,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, 0, 16,
mask, NL80211_MESHCONF_MAX_RETRIES,
- nla_get_u8);
+ nl80211_check_u8);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, 1, 255,
- mask, NL80211_MESHCONF_TTL, nla_get_u8);
+ mask, NL80211_MESHCONF_TTL, nl80211_check_u8);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, 1, 255,
mask, NL80211_MESHCONF_ELEMENT_TTL,
- nla_get_u8);
+ nl80211_check_u8);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, 0, 1,
mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS,
- nla_get_u8);
+ nl80211_check_bool);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor,
1, 255, mask,
NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR,
- nla_get_u32);
+ nl80211_check_u32);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, 0, 255,
mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
- nla_get_u8);
+ nl80211_check_u8);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, 1, 65535,
mask, NL80211_MESHCONF_PATH_REFRESH_TIME,
- nla_get_u32);
+ nl80211_check_u32);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, 1, 65535,
mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout,
1, 65535, mask,
NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
- nla_get_u32);
+ nl80211_check_u32);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval,
1, 65535, mask,
NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval,
1, 65535, mask,
NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
dot11MeshHWMPnetDiameterTraversalTime,
1, 65535, mask,
NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, 0, 4,
mask, NL80211_MESHCONF_HWMP_ROOTMODE,
- nla_get_u8);
+ nl80211_check_u8);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, 1, 65535,
mask, NL80211_MESHCONF_HWMP_RANN_INTERVAL,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
dot11MeshGateAnnouncementProtocol, 0, 1,
mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS,
- nla_get_u8);
+ nl80211_check_bool);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, 0, 1,
mask, NL80211_MESHCONF_FORWARDING,
- nla_get_u8);
+ nl80211_check_bool);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, -255, 0,
mask, NL80211_MESHCONF_RSSI_THRESHOLD,
- nla_get_s32);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, ht_opmode, 0, 16,
- mask, NL80211_MESHCONF_HT_OPMODE,
- nla_get_u16);
+ nl80211_check_s32);
+ /*
+ * Check HT operation mode based on
+ * IEEE 802.11 2012 8.4.2.59 HT Operation element.
+ */
+ if (tb[NL80211_MESHCONF_HT_OPMODE]) {
+ ht_opmode = nla_get_u16(tb[NL80211_MESHCONF_HT_OPMODE]);
+
+ if (ht_opmode & ~(IEEE80211_HT_OP_MODE_PROTECTION |
+ IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT |
+ IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT))
+ return -EINVAL;
+
+ if ((ht_opmode & IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT) &&
+ (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT))
+ return -EINVAL;
+
+ switch (ht_opmode & IEEE80211_HT_OP_MODE_PROTECTION) {
+ case IEEE80211_HT_OP_MODE_PROTECTION_NONE:
+ case IEEE80211_HT_OP_MODE_PROTECTION_20MHZ:
+ if (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)
+ return -EINVAL;
+ break;
+ case IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER:
+ case IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED:
+ if (!(ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT))
+ return -EINVAL;
+ break;
+ }
+ cfg->ht_opmode = ht_opmode;
+ }
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout,
1, 65535, mask,
NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT,
- nla_get_u32);
+ nl80211_check_u32);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, 1, 65535,
mask, NL80211_MESHCONF_HWMP_ROOT_INTERVAL,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
dot11MeshHWMPconfirmationInterval,
1, 65535, mask,
NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL,
- nla_get_u16);
+ nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode,
NL80211_MESH_POWER_ACTIVE,
NL80211_MESH_POWER_MAX,
mask, NL80211_MESHCONF_POWER_MODE,
- nla_get_u32);
+ nl80211_check_u32);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration,
0, 65535, mask,
- NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16);
+ NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff,
mask, NL80211_MESHCONF_PLINK_TIMEOUT,
- nla_get_u32);
+ nl80211_check_u32);
if (mask_out)
*mask_out = mask;
@@ -5409,7 +5564,6 @@ static int nl80211_parse_mesh_setup(struct genl_info *info,
IEEE80211_PATH_METRIC_VENDOR :
IEEE80211_PATH_METRIC_AIRTIME;
-
if (tb[NL80211_MESH_SETUP_IE]) {
struct nlattr *ieattr =
tb[NL80211_MESH_SETUP_IE];
@@ -5796,10 +5950,8 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
}
}
- r = set_regdom(rd, REGD_SOURCE_CRDA);
- /* set_regdom took ownership */
- rd = NULL;
-
+ /* set_regdom takes ownership of rd */
+ return set_regdom(rd, REGD_SOURCE_CRDA);
bad_reg:
kfree(rd);
return r;
@@ -6033,6 +6185,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
/* all channels */
for (band = 0; band < NUM_NL80211_BANDS; band++) {
int j;
+
if (!wiphy->bands[band])
continue;
for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
@@ -6104,6 +6257,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
}
}
+ if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) {
+ if (!wiphy_ext_feature_isset(wiphy,
+ NL80211_EXT_FEATURE_SET_SCAN_DWELL)) {
+ err = -EOPNOTSUPP;
+ goto out_free;
+ }
+
+ request->duration =
+ nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]);
+ request->duration_mandatory =
+ nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]);
+ }
+
if (info->attrs[NL80211_ATTR_SCAN_FLAGS]) {
request->flags = nla_get_u32(
info->attrs[NL80211_ATTR_SCAN_FLAGS]);
@@ -6442,6 +6608,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
/* all channels */
for (band = 0; band < NUM_NL80211_BANDS; band++) {
int j;
+
if (!wiphy->bands[band])
continue;
for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
@@ -6511,7 +6678,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
nla_data(ssid), nla_len(ssid));
request->match_sets[i].ssid.ssid_len =
nla_len(ssid);
- /* special attribute - old implemenation w/a */
+ /* special attribute - old implementation w/a */
request->match_sets[i].rssi_thold =
default_match_rssi;
rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI];
@@ -6936,6 +7103,13 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
jiffies_to_msecs(jiffies - intbss->ts)))
goto nla_put_failure;
+ if (intbss->parent_tsf &&
+ (nla_put_u64_64bit(msg, NL80211_BSS_PARENT_TSF,
+ intbss->parent_tsf, NL80211_BSS_PAD) ||
+ nla_put(msg, NL80211_BSS_PARENT_BSSID, ETH_ALEN,
+ intbss->parent_bssid)))
+ goto nla_put_failure;
+
if (intbss->ts_boottime &&
nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME,
intbss->ts_boottime, NL80211_BSS_PAD))
@@ -7204,6 +7378,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
if (key.idx >= 0) {
int i;
bool ok = false;
+
for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) {
if (key.p.cipher == rdev->wiphy.cipher_suites[i]) {
ok = true;
@@ -7282,6 +7457,7 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) {
u16 proto;
+
proto = nla_get_u16(
info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]);
settings->control_port_ethertype = cpu_to_be16(proto);
@@ -8435,6 +8611,7 @@ static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
for (i = 0; i < rates_len; i++) {
int rate = (rates[i] & 0x7f) * 5;
int ridx;
+
for (ridx = 0; ridx < sband->n_bitrates; ridx++) {
struct ieee80211_rate *srate =
&sband->bitrates[ridx];
@@ -8743,7 +8920,6 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
if (params.wait < NL80211_MIN_REMAIN_ON_CHANNEL_TIME ||
params.wait > rdev->wiphy.max_remain_on_channel_duration)
return -EINVAL;
-
}
params.offchan = info->attrs[NL80211_ATTR_OFFCHANNEL_TX_OK];
@@ -10590,7 +10766,6 @@ int cfg80211_vendor_cmd_reply(struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(cfg80211_vendor_cmd_reply);
-
static int nl80211_set_qos_map(struct sk_buff *skb,
struct genl_info *info)
{
@@ -10945,7 +11120,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_WIPHY,
.doit = nl80211_set_wiphy,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_RTNL,
},
{
@@ -10961,7 +11136,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_INTERFACE,
.doit = nl80211_set_interface,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -10969,7 +11144,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_NEW_INTERFACE,
.doit = nl80211_new_interface,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WIPHY |
NL80211_FLAG_NEED_RTNL,
},
@@ -10977,7 +11152,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_DEL_INTERFACE,
.doit = nl80211_del_interface,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -10985,7 +11160,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_GET_KEY,
.doit = nl80211_get_key,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -10993,7 +11168,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_KEY,
.doit = nl80211_set_key,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL |
NL80211_FLAG_CLEAR_SKB,
@@ -11002,7 +11177,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_NEW_KEY,
.doit = nl80211_new_key,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL |
NL80211_FLAG_CLEAR_SKB,
@@ -11011,14 +11186,14 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_DEL_KEY,
.doit = nl80211_del_key,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_BEACON,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.doit = nl80211_set_beacon,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
@@ -11026,7 +11201,7 @@ static const struct genl_ops nl80211_ops[] = {
{
.cmd = NL80211_CMD_START_AP,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.doit = nl80211_start_ap,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
@@ -11034,7 +11209,7 @@ static const struct genl_ops nl80211_ops[] = {
{
.cmd = NL80211_CMD_STOP_AP,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.doit = nl80211_stop_ap,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
@@ -11051,7 +11226,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_STATION,
.doit = nl80211_set_station,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11059,7 +11234,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_NEW_STATION,
.doit = nl80211_new_station,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11067,7 +11242,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_DEL_STATION,
.doit = nl80211_del_station,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11076,7 +11251,7 @@ static const struct genl_ops nl80211_ops[] = {
.doit = nl80211_get_mpath,
.dumpit = nl80211_dump_mpath,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11085,7 +11260,7 @@ static const struct genl_ops nl80211_ops[] = {
.doit = nl80211_get_mpp,
.dumpit = nl80211_dump_mpp,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11093,7 +11268,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_MPATH,
.doit = nl80211_set_mpath,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11101,7 +11276,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_NEW_MPATH,
.doit = nl80211_new_mpath,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11109,7 +11284,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_DEL_MPATH,
.doit = nl80211_del_mpath,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11117,7 +11292,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_BSS,
.doit = nl80211_set_bss,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11156,7 +11331,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_MESH_CONFIG,
.doit = nl80211_update_mesh_config,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11164,7 +11339,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_TRIGGER_SCAN,
.doit = nl80211_trigger_scan,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11172,7 +11347,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_ABORT_SCAN,
.doit = nl80211_abort_scan,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11185,7 +11360,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_START_SCHED_SCAN,
.doit = nl80211_start_sched_scan,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11193,7 +11368,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_STOP_SCHED_SCAN,
.doit = nl80211_stop_sched_scan,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11201,7 +11376,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_AUTHENTICATE,
.doit = nl80211_authenticate,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL |
NL80211_FLAG_CLEAR_SKB,
@@ -11210,7 +11385,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_ASSOCIATE,
.doit = nl80211_associate,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11218,7 +11393,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_DEAUTHENTICATE,
.doit = nl80211_deauthenticate,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11226,7 +11401,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_DISASSOCIATE,
.doit = nl80211_disassociate,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11234,7 +11409,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_JOIN_IBSS,
.doit = nl80211_join_ibss,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11242,7 +11417,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_LEAVE_IBSS,
.doit = nl80211_leave_ibss,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11252,7 +11427,7 @@ static const struct genl_ops nl80211_ops[] = {
.doit = nl80211_testmode_do,
.dumpit = nl80211_testmode_dump,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WIPHY |
NL80211_FLAG_NEED_RTNL,
},
@@ -11261,7 +11436,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_CONNECT,
.doit = nl80211_connect,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11269,7 +11444,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_DISCONNECT,
.doit = nl80211_disconnect,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11277,7 +11452,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_WIPHY_NETNS,
.doit = nl80211_wiphy_netns,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WIPHY |
NL80211_FLAG_NEED_RTNL,
},
@@ -11290,7 +11465,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_PMKSA,
.doit = nl80211_setdel_pmksa,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11298,7 +11473,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_DEL_PMKSA,
.doit = nl80211_setdel_pmksa,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11306,7 +11481,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_FLUSH_PMKSA,
.doit = nl80211_flush_pmksa,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11314,7 +11489,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_REMAIN_ON_CHANNEL,
.doit = nl80211_remain_on_channel,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11322,7 +11497,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
.doit = nl80211_cancel_remain_on_channel,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11330,7 +11505,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_TX_BITRATE_MASK,
.doit = nl80211_set_tx_bitrate_mask,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11338,7 +11513,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_REGISTER_FRAME,
.doit = nl80211_register_mgmt,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11346,7 +11521,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_FRAME,
.doit = nl80211_tx_mgmt,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11354,7 +11529,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_FRAME_WAIT_CANCEL,
.doit = nl80211_tx_mgmt_cancel_wait,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11362,7 +11537,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_POWER_SAVE,
.doit = nl80211_set_power_save,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11378,7 +11553,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_CQM,
.doit = nl80211_set_cqm,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11386,7 +11561,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_CHANNEL,
.doit = nl80211_set_channel,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11394,7 +11569,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_WDS_PEER,
.doit = nl80211_set_wds_peer,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11402,7 +11577,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_JOIN_MESH,
.doit = nl80211_join_mesh,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11410,7 +11585,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_LEAVE_MESH,
.doit = nl80211_leave_mesh,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11418,7 +11593,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_JOIN_OCB,
.doit = nl80211_join_ocb,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11426,7 +11601,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_LEAVE_OCB,
.doit = nl80211_leave_ocb,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11443,7 +11618,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_WOWLAN,
.doit = nl80211_set_wowlan,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WIPHY |
NL80211_FLAG_NEED_RTNL,
},
@@ -11452,7 +11627,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_REKEY_OFFLOAD,
.doit = nl80211_set_rekey_data,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL |
NL80211_FLAG_CLEAR_SKB,
@@ -11461,7 +11636,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_TDLS_MGMT,
.doit = nl80211_tdls_mgmt,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11469,7 +11644,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_TDLS_OPER,
.doit = nl80211_tdls_oper,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11477,7 +11652,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_UNEXPECTED_FRAME,
.doit = nl80211_register_unexpected_frame,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11485,7 +11660,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_PROBE_CLIENT,
.doit = nl80211_probe_client,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11493,7 +11668,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_REGISTER_BEACONS,
.doit = nl80211_register_beacons,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WIPHY |
NL80211_FLAG_NEED_RTNL,
},
@@ -11501,7 +11676,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_NOACK_MAP,
.doit = nl80211_set_noack_map,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11509,7 +11684,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_START_P2P_DEVICE,
.doit = nl80211_start_p2p_device,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11517,7 +11692,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_STOP_P2P_DEVICE,
.doit = nl80211_stop_p2p_device,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11525,7 +11700,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_MCAST_RATE,
.doit = nl80211_set_mcast_rate,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11533,7 +11708,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_MAC_ACL,
.doit = nl80211_set_mac_acl,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
@@ -11541,7 +11716,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_RADAR_DETECT,
.doit = nl80211_start_radar_detection,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11554,7 +11729,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_UPDATE_FT_IES,
.doit = nl80211_update_ft_ies,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11562,7 +11737,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_CRIT_PROTOCOL_START,
.doit = nl80211_crit_protocol_start,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11570,7 +11745,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_CRIT_PROTOCOL_STOP,
.doit = nl80211_crit_protocol_stop,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11585,7 +11760,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_COALESCE,
.doit = nl80211_set_coalesce,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WIPHY |
NL80211_FLAG_NEED_RTNL,
},
@@ -11593,7 +11768,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_CHANNEL_SWITCH,
.doit = nl80211_channel_switch,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11602,7 +11777,7 @@ static const struct genl_ops nl80211_ops[] = {
.doit = nl80211_vendor_cmd,
.dumpit = nl80211_vendor_cmd_dump,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WIPHY |
NL80211_FLAG_NEED_RTNL,
},
@@ -11610,7 +11785,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_QOS_MAP,
.doit = nl80211_set_qos_map,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11618,7 +11793,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_ADD_TX_TS,
.doit = nl80211_add_tx_ts,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11626,7 +11801,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_DEL_TX_TS,
.doit = nl80211_del_tx_ts,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11634,7 +11809,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH,
.doit = nl80211_tdls_channel_switch,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11642,7 +11817,7 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH,
.doit = nl80211_tdls_cancel_channel_switch,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
@@ -11708,6 +11883,13 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags))
goto nla_put_failure;
+ if (req->info.scan_start_tsf &&
+ (nla_put_u64_64bit(msg, NL80211_ATTR_SCAN_START_TIME_TSF,
+ req->info.scan_start_tsf, NL80211_BSS_PAD) ||
+ nla_put(msg, NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, ETH_ALEN,
+ req->info.tsf_bssid)))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
return -ENOBUFS;
@@ -12092,7 +12274,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
struct net_device *netdev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
const u8 *resp_ie, size_t resp_ie_len,
- u16 status, gfp_t gfp)
+ int status, gfp_t gfp)
{
struct sk_buff *msg;
void *hdr;
@@ -12110,7 +12292,10 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
(bssid && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) ||
- nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, status) ||
+ nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
+ status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE :
+ status) ||
+ (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) ||
(req_ie &&
nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) ||
(resp_ie &&
@@ -12126,7 +12311,6 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
nla_put_failure:
genlmsg_cancel(msg, hdr);
nlmsg_free(msg);
-
}
void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
@@ -12165,7 +12349,6 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
nla_put_failure:
genlmsg_cancel(msg, hdr);
nlmsg_free(msg);
-
}
void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
@@ -12203,7 +12386,6 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
nla_put_failure:
genlmsg_cancel(msg, hdr);
nlmsg_free(msg);
-
}
void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
@@ -13545,7 +13727,6 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp)
if (hdr)
genlmsg_cancel(msg, hdr);
nlmsg_free(msg);
-
}
EXPORT_SYMBOL(cfg80211_crit_proto_stopped);
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 84d4edf1d..a63f402b1 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -55,7 +55,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
struct net_device *netdev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
const u8 *resp_ie, size_t resp_ie_len,
- u16 status, gfp_t gfp);
+ int status, gfp_t gfp);
void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
struct net_device *netdev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index ef2955c89..0358e12be 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -3,6 +3,7 @@
*
* Copyright 2008 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2016 Intel Deutschland GmbH
*/
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -194,7 +195,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
if (wdev->netdev)
cfg80211_sme_scan_done(wdev->netdev);
- if (!request->aborted &&
+ if (!request->info.aborted &&
request->flags & NL80211_SCAN_FLAG_FLUSH) {
/* flush entries from previous scans */
spin_lock_bh(&rdev->bss_lock);
@@ -202,10 +203,10 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
spin_unlock_bh(&rdev->bss_lock);
}
- msg = nl80211_build_scan_msg(rdev, wdev, request->aborted);
+ msg = nl80211_build_scan_msg(rdev, wdev, request->info.aborted);
#ifdef CONFIG_CFG80211_WEXT
- if (wdev->netdev && !request->aborted) {
+ if (wdev->netdev && !request->info.aborted) {
memset(&wrqu, 0, sizeof(wrqu));
wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL);
@@ -236,12 +237,13 @@ void __cfg80211_scan_done(struct work_struct *wk)
rtnl_unlock();
}
-void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted)
+void cfg80211_scan_done(struct cfg80211_scan_request *request,
+ struct cfg80211_scan_info *info)
{
- trace_cfg80211_scan_done(request, aborted);
+ trace_cfg80211_scan_done(request, info);
WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req);
- request->aborted = aborted;
+ request->info = *info;
request->notified = true;
queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk);
}
@@ -843,6 +845,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
found->pub.capability = tmp->pub.capability;
found->ts = tmp->ts;
found->ts_boottime = tmp->ts_boottime;
+ found->parent_tsf = tmp->parent_tsf;
+ ether_addr_copy(found->parent_bssid, tmp->parent_bssid);
} else {
struct cfg80211_internal_bss *new;
struct cfg80211_internal_bss *hidden;
@@ -1086,6 +1090,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
tmp.ts_boottime = data->boottime_ns;
+ tmp.parent_tsf = data->parent_tsf;
+ ether_addr_copy(tmp.parent_bssid, data->parent_bssid);
signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
wiphy->max_adj_channel_rssi_comp;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 584fdc347..add6824c4 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -244,9 +244,7 @@ void cfg80211_conn_work(struct work_struct *work)
if (cfg80211_conn_do_work(wdev)) {
__cfg80211_connect_result(
wdev->netdev, bssid,
- NULL, 0, NULL, 0,
- WLAN_STATUS_UNSPECIFIED_FAILURE,
- false, NULL);
+ NULL, 0, NULL, 0, -1, false, NULL);
}
wdev_unlock(wdev);
}
@@ -648,7 +646,7 @@ static DECLARE_WORK(cfg80211_disconnect_work, disconnect_work);
void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
const u8 *resp_ie, size_t resp_ie_len,
- u16 status, bool wextev,
+ int status, bool wextev,
struct cfg80211_bss *bss)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
@@ -757,7 +755,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
struct cfg80211_bss *bss, const u8 *req_ie,
size_t req_ie_len, const u8 *resp_ie,
- size_t resp_ie_len, u16 status, gfp_t gfp)
+ size_t resp_ie_len, int status, gfp_t gfp)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 3c1091ae6..72b5255ce 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2642,8 +2642,9 @@ TRACE_EVENT(cfg80211_tdls_oper_request,
);
TRACE_EVENT(cfg80211_scan_done,
- TP_PROTO(struct cfg80211_scan_request *request, bool aborted),
- TP_ARGS(request, aborted),
+ TP_PROTO(struct cfg80211_scan_request *request,
+ struct cfg80211_scan_info *info),
+ TP_ARGS(request, info),
TP_STRUCT__entry(
__field(u32, n_channels)
__dynamic_array(u8, ie, request ? request->ie_len : 0)
@@ -2652,6 +2653,8 @@ TRACE_EVENT(cfg80211_scan_done,
MAC_ENTRY(wiphy_mac)
__field(bool, no_cck)
__field(bool, aborted)
+ __field(u64, scan_start_tsf)
+ MAC_ENTRY(tsf_bssid)
),
TP_fast_assign(
if (request) {
@@ -2666,9 +2669,16 @@ TRACE_EVENT(cfg80211_scan_done,
request->wiphy->perm_addr);
__entry->no_cck = request->no_cck;
}
- __entry->aborted = aborted;
+ if (info) {
+ __entry->aborted = info->aborted;
+ __entry->scan_start_tsf = info->scan_start_tsf;
+ MAC_ASSIGN(tsf_bssid, info->tsf_bssid);
+ }
),
- TP_printk("aborted: %s", BOOL_TO_STR(__entry->aborted))
+ TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: " MAC_PR_FMT,
+ BOOL_TO_STR(__entry->aborted),
+ (unsigned long long)__entry->scan_start_tsf,
+ MAC_PR_ARG(tsf_bssid))
);
DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_results,
@@ -2721,6 +2731,8 @@ TRACE_EVENT(cfg80211_inform_bss_frame,
__dynamic_array(u8, mgmt, len)
__field(s32, signal)
__field(u64, ts_boottime)
+ __field(u64, parent_tsf)
+ MAC_ENTRY(parent_bssid)
),
TP_fast_assign(
WIPHY_ASSIGN;
@@ -2730,10 +2742,15 @@ TRACE_EVENT(cfg80211_inform_bss_frame,
memcpy(__get_dynamic_array(mgmt), mgmt, len);
__entry->signal = data->signal;
__entry->ts_boottime = data->boottime_ns;
- ),
- TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d, tsb:%llu",
- WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
- __entry->signal, (unsigned long long)__entry->ts_boottime)
+ __entry->parent_tsf = data->parent_tsf;
+ MAC_ASSIGN(parent_bssid, data->parent_bssid);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT
+ "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: "
+ MAC_PR_FMT, WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
+ __entry->signal, (unsigned long long)__entry->ts_boottime,
+ (unsigned long long)__entry->parent_tsf,
+ MAC_PR_ARG(parent_bssid))
);
DECLARE_EVENT_CLASS(cfg80211_bss_evt,
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 1c4ad477c..6e3f0254d 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -207,15 +207,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
family = XFRM_SPI_SKB_CB(skb)->family;
/* if tunnel is present override skb->mark value with tunnel i_key */
- if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) {
- switch (family) {
- case AF_INET:
+ switch (family) {
+ case AF_INET:
+ if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4)
mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key);
- break;
- case AF_INET6:
+ break;
+ case AF_INET6:
+ if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6)
mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key);
- break;
- }
+ break;
}
/* Allocate new secpath or COW existing one. */
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b5e665b3c..45f9cf97e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -626,6 +626,10 @@ static void xfrm_hash_rebuild(struct work_struct *work)
/* re-insert all policies by order of creation */
list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+ if (xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) {
+ /* skip socket policies */
+ continue;
+ }
newpos = NULL;
chain = policy_hash_bysel(net, &policy->selector,
policy->family,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 9895a8c56..a30f898dc 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -332,6 +332,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
{
tasklet_hrtimer_cancel(&x->mtimer);
del_timer_sync(&x->rtimer);
+ kfree(x->aead);
kfree(x->aalg);
kfree(x->ealg);
kfree(x->calg);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index d516845e1..08892091c 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -581,9 +581,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
if (err)
goto error;
- if (attrs[XFRMA_SEC_CTX] &&
- security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX])))
- goto error;
+ if (attrs[XFRMA_SEC_CTX]) {
+ err = security_xfrm_state_alloc(x,
+ nla_data(attrs[XFRMA_SEC_CTX]));
+ if (err)
+ goto error;
+ }
if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
attrs[XFRMA_REPLAY_ESN_VAL])))
@@ -896,7 +899,8 @@ static int xfrm_dump_sa_done(struct netlink_callback *cb)
struct sock *sk = cb->skb->sk;
struct net *net = sock_net(sk);
- xfrm_state_walk_done(walk, net);
+ if (cb->args[0])
+ xfrm_state_walk_done(walk, net);
return 0;
}
@@ -921,8 +925,6 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
u8 proto = 0;
int err;
- cb->args[0] = 1;
-
err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX,
xfrma_policy);
if (err < 0)
@@ -939,6 +941,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
proto = nla_get_u8(attrs[XFRMA_PROTO]);
xfrm_state_walk_init(walk, proto, filter);
+ cb->args[0] = 1;
}
(void) xfrm_state_walk(net, walk, dump_one_state, &info);
@@ -2051,9 +2054,6 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
if (up->hard) {
xfrm_policy_delete(xp, p->dir);
xfrm_audit_policy_delete(xp, 1, true);
- } else {
- // reset the timers here?
- WARN(1, "Don't know what to do with soft policy expire\n");
}
km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid);
@@ -2117,7 +2117,7 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
err = verify_newpolicy_info(&ua->policy);
if (err)
- goto bad_policy;
+ goto free_state;
/* build an XP */
xp = xfrm_policy_construct(net, &ua->policy, attrs, &err);
@@ -2149,8 +2149,6 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
-bad_policy:
- WARN(1, "BAD policy passed\n");
free_state:
kfree(x);
nomem: