From d0b2f91bede3bd5e3d24dd6803e56eee959c1797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Fabian=20Silva=20Delgado?= Date: Thu, 20 Oct 2016 00:10:27 -0300 Subject: Linux-libre 4.8.2-gnu --- net/6lowpan/6lowpan_i.h | 4 + net/6lowpan/Makefile | 2 +- net/6lowpan/core.c | 50 +- net/6lowpan/debugfs.c | 39 + net/6lowpan/iphc.c | 167 ++- net/6lowpan/ndisc.c | 241 ++++ net/8021q/vlan.c | 2 +- net/8021q/vlan_dev.c | 2 + net/9p/trans_virtio.c | 4 +- net/Kconfig | 1 + net/Makefile | 1 + net/atm/clip.c | 2 +- net/batman-adv/Kconfig | 2 +- net/batman-adv/Makefile | 5 + net/batman-adv/bat_algo.c | 140 ++ net/batman-adv/bat_algo.h | 32 +- net/batman-adv/bat_iv_ogm.c | 106 +- net/batman-adv/bat_iv_ogm.h | 25 + net/batman-adv/bat_v.c | 58 +- net/batman-adv/bat_v.h | 52 + net/batman-adv/bat_v_elp.c | 9 +- net/batman-adv/bat_v_elp.h | 4 +- net/batman-adv/bat_v_ogm.c | 9 +- net/batman-adv/bat_v_ogm.h | 4 +- net/batman-adv/bitarray.c | 2 + net/batman-adv/bridge_loop_avoidance.c | 1 + net/batman-adv/debugfs.c | 240 +--- net/batman-adv/distributed-arp-table.c | 2 + net/batman-adv/fragmentation.c | 53 +- net/batman-adv/fragmentation.h | 6 +- net/batman-adv/gateway_client.c | 16 +- net/batman-adv/gateway_common.c | 10 +- net/batman-adv/hard-interface.c | 25 +- net/batman-adv/icmp_socket.c | 1 + net/batman-adv/log.c | 231 +++ net/batman-adv/log.h | 111 ++ net/batman-adv/main.c | 709 +-------- net/batman-adv/main.h | 121 +- net/batman-adv/multicast.c | 501 ++++++- net/batman-adv/multicast.h | 3 + net/batman-adv/netlink.c | 424 ++++++ net/batman-adv/netlink.h | 32 + net/batman-adv/network-coding.c | 2 + net/batman-adv/originator.c | 91 +- net/batman-adv/originator.h | 6 +- net/batman-adv/packet.h | 61 +- net/batman-adv/routing.c | 73 +- net/batman-adv/send.c | 102 +- net/batman-adv/send.h | 4 +- net/batman-adv/soft-interface.c | 13 +- net/batman-adv/sysfs.c | 29 +- net/batman-adv/tp_meter.c | 1507 ++++++++++++++++++++ net/batman-adv/tp_meter.h | 34 + net/batman-adv/translation-table.c | 10 +- net/batman-adv/tvlv.c | 632 ++++++++ net/batman-adv/tvlv.h | 61 + net/batman-adv/types.h | 258 +++- net/bluetooth/6lowpan.c | 13 +- net/bluetooth/af_bluetooth.c | 5 + net/bluetooth/hci_conn.c | 2 +- net/bluetooth/hci_core.c | 52 +- net/bluetooth/hci_debugfs.c | 35 + net/bluetooth/hci_event.c | 18 +- net/bluetooth/hci_request.c | 2 + net/bluetooth/hci_sock.c | 7 +- net/bluetooth/hci_sysfs.c | 99 -- net/bluetooth/l2cap_core.c | 10 +- net/bluetooth/l2cap_sock.c | 14 +- net/bluetooth/mgmt.c | 18 +- net/bluetooth/smp.c | 67 +- net/bridge/br_device.c | 34 +- net/bridge/br_fdb.c | 52 +- net/bridge/br_forward.c | 203 +-- net/bridge/br_if.c | 9 +- net/bridge/br_input.c | 72 +- net/bridge/br_multicast.c | 243 +++- net/bridge/br_netlink.c | 148 +- net/bridge/br_private.h | 73 +- net/bridge/br_stp.c | 2 +- net/bridge/br_stp_if.c | 2 +- net/bridge/br_sysfs_br.c | 25 + net/bridge/netfilter/ebt_802_3.c | 6 +- net/bridge/netfilter/ebt_arp.c | 43 +- net/bridge/netfilter/ebt_ip.c | 28 +- net/bridge/netfilter/ebt_ip6.c | 41 +- net/bridge/netfilter/ebt_stp.c | 97 +- net/bridge/netfilter/ebtables.c | 34 +- net/bridge/netfilter/nft_meta_bridge.c | 1 + net/bridge/netfilter/nft_reject_bridge.c | 8 +- net/caif/chnl_net.c | 1 - net/can/Makefile | 3 +- net/can/af_can.c | 22 +- net/can/bcm.c | 309 ++-- net/can/proc.c | 3 +- net/ceph/Makefile | 2 +- net/ceph/ceph_common.c | 2 + net/ceph/ceph_fs.c | 30 +- net/ceph/debugfs.c | 12 +- net/ceph/mon_client.c | 6 +- net/ceph/msgpool.c | 1 + net/ceph/osd_client.c | 49 +- net/ceph/osdmap.c | 58 +- net/ceph/string_table.c | 105 ++ net/core/dev.c | 198 ++- net/core/devlink.c | 91 ++ net/core/drop_monitor.c | 3 +- net/core/ethtool.c | 1 + net/core/fib_rules.c | 82 +- net/core/filter.c | 523 ++++++- net/core/flow_dissector.c | 6 +- net/core/gen_estimator.c | 24 +- net/core/gen_stats.c | 35 +- net/core/neighbour.c | 13 +- net/core/net-sysfs.c | 15 +- net/core/netpoll.c | 2 +- net/core/pktgen.c | 43 +- net/core/rtnetlink.c | 156 +- net/core/skbuff.c | 46 +- net/core/utils.c | 8 +- net/dccp/ipv6.c | 12 +- net/dsa/Makefile | 2 +- net/dsa/dsa.c | 259 ++-- net/dsa/dsa2.c | 695 +++++++++ net/dsa/dsa_priv.h | 9 +- net/dsa/slave.c | 121 +- net/dsa/tag_brcm.c | 4 +- net/dsa/tag_dsa.c | 10 +- net/dsa/tag_edsa.c | 10 +- net/dsa/tag_trailer.c | 4 +- net/ieee802154/6lowpan/core.c | 30 +- net/ieee802154/6lowpan/rx.c | 2 +- net/ieee802154/6lowpan/tx.c | 113 +- net/ieee802154/core.c | 70 +- net/ieee802154/core.h | 2 + net/ieee802154/nl802154.c | 54 +- net/ipv4/Kconfig | 16 + net/ipv4/Makefile | 1 + net/ipv4/af_inet.c | 5 +- net/ipv4/cipso_ipv4.c | 88 +- net/ipv4/devinet.c | 23 +- net/ipv4/fib_frontend.c | 3 +- net/ipv4/fib_rules.c | 6 +- net/ipv4/fib_semantics.c | 8 +- net/ipv4/fib_trie.c | 4 +- net/ipv4/fou.c | 81 +- net/ipv4/gre_demux.c | 1 + net/ipv4/inet_connection_sock.c | 7 +- net/ipv4/inet_diag.c | 25 + net/ipv4/inet_fragment.c | 2 +- net/ipv4/inet_timewait_sock.c | 5 +- net/ipv4/ip_forward.c | 4 +- net/ipv4/ip_gre.c | 52 +- net/ipv4/ip_input.c | 5 +- net/ipv4/ip_output.c | 8 +- net/ipv4/ip_tunnel.c | 2 +- net/ipv4/ip_tunnel_core.c | 11 + net/ipv4/ip_vti.c | 15 +- net/ipv4/ipip.c | 137 +- net/ipv4/ipmr.c | 21 +- net/ipv4/netfilter/arp_tables.c | 88 +- net/ipv4/netfilter/ip_tables.c | 65 +- net/ipv4/netfilter/iptable_mangle.c | 4 - .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 14 +- net/ipv4/netfilter/nf_reject_ipv4.c | 3 + net/ipv4/netfilter/nft_chain_route_ipv4.c | 11 +- net/ipv4/netfilter/nft_reject_ipv4.c | 1 + net/ipv4/route.c | 13 +- net/ipv4/tcp.c | 69 +- net/ipv4/tcp_dctcp.c | 4 +- net/ipv4/tcp_diag.c | 7 +- net/ipv4/tcp_fastopen.c | 1 + net/ipv4/tcp_input.c | 39 +- net/ipv4/tcp_ipv4.c | 31 +- net/ipv4/tcp_nv.c | 476 +++++++ net/ipv4/tcp_output.c | 23 +- net/ipv4/tcp_timer.c | 82 +- net/ipv4/tunnel4.c | 72 +- net/ipv4/udp.c | 1 - net/ipv4/udp_tunnel.c | 61 + net/ipv4/udplite.c | 1 - net/ipv4/xfrm4_policy.c | 10 +- net/ipv6/Makefile | 1 + net/ipv6/addrconf.c | 325 +++-- net/ipv6/af_inet6.c | 15 +- net/ipv6/calipso.c | 1475 +++++++++++++++++++ net/ipv6/exthdrs.c | 76 + net/ipv6/exthdrs_core.c | 2 +- net/ipv6/fib6_rules.c | 6 +- net/ipv6/icmp.c | 78 +- net/ipv6/ila/ila.h | 3 +- net/ipv6/ila/ila_common.c | 16 +- net/ipv6/ila/ila_lwt.c | 4 +- net/ipv6/ila/ila_xlat.c | 8 +- net/ipv6/ip6_gre.c | 3 - net/ipv6/ip6_icmp.c | 2 +- net/ipv6/ip6_input.c | 1 + net/ipv6/ip6_output.c | 14 +- net/ipv6/ip6_vti.c | 19 +- net/ipv6/ip6mr.c | 34 +- net/ipv6/ipv6_sockglue.c | 1 - net/ipv6/ndisc.c | 123 +- net/ipv6/netfilter/ip6_tables.c | 61 +- net/ipv6/netfilter/ip6table_mangle.c | 4 - net/ipv6/netfilter/nft_chain_route_ipv6.c | 10 +- net/ipv6/netfilter/nft_reject_ipv6.c | 1 + net/ipv6/ping.c | 35 +- net/ipv6/raw.c | 8 +- net/ipv6/route.c | 47 +- net/ipv6/sit.c | 140 +- net/ipv6/sysctl_net_ipv6.c | 19 + net/ipv6/tcp_ipv6.c | 41 +- net/ipv6/udp.c | 9 +- net/ipv6/udplite.c | 1 - net/ipv6/xfrm6_input.c | 15 +- net/ipv6/xfrm6_policy.c | 6 +- net/ipv6/xfrm6_tunnel.c | 2 +- net/irda/af_irda.c | 5 +- net/irda/ircomm/ircomm_tty_ioctl.c | 8 - net/iucv/af_iucv.c | 228 +-- net/iucv/iucv.c | 36 +- net/kcm/kcmproc.c | 6 +- net/kcm/kcmsock.c | 8 +- net/l2tp/l2tp_core.c | 3 + net/l2tp/l2tp_eth.c | 4 +- net/l2tp/l2tp_ip6.c | 8 +- net/l2tp/l2tp_ppp.c | 9 +- net/l3mdev/l3mdev.c | 64 +- net/mac80211/agg-rx.c | 18 + net/mac80211/agg-tx.c | 8 +- net/mac80211/cfg.c | 1 + net/mac80211/debugfs.c | 173 +++ net/mac80211/debugfs_sta.c | 78 +- net/mac80211/driver-ops.h | 2 +- net/mac80211/ieee80211_i.h | 32 +- net/mac80211/iface.c | 26 +- net/mac80211/main.c | 10 +- net/mac80211/mesh.c | 30 +- net/mac80211/mesh_hwmp.c | 3 +- net/mac80211/mesh_pathtbl.c | 2 +- net/mac80211/mesh_plink.c | 16 +- net/mac80211/rx.c | 9 +- net/mac80211/scan.c | 42 +- net/mac80211/spectmgmt.c | 45 +- net/mac80211/sta_info.c | 16 +- net/mac80211/status.c | 14 +- net/mac80211/tdls.c | 8 +- net/mac80211/tx.c | 335 ++++- net/mac80211/util.c | 34 +- net/mpls/af_mpls.c | 11 +- net/ncsi/Kconfig | 12 + net/ncsi/Makefile | 4 + net/ncsi/internal.h | 328 +++++ net/ncsi/ncsi-aen.c | 193 +++ net/ncsi/ncsi-cmd.c | 367 +++++ net/ncsi/ncsi-manage.c | 1205 ++++++++++++++++ net/ncsi/ncsi-pkt.h | 415 ++++++ net/ncsi/ncsi-rsp.c | 1035 ++++++++++++++ net/netfilter/Kconfig | 10 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 25 +- net/netfilter/nf_conntrack_core.c | 220 +-- net/netfilter/nf_conntrack_expect.c | 2 +- net/netfilter/nf_conntrack_extend.c | 15 +- net/netfilter/nf_conntrack_ftp.c | 58 +- net/netfilter/nf_conntrack_h323_asn1.c | 3 +- net/netfilter/nf_conntrack_h323_main.c | 16 +- net/netfilter/nf_conntrack_helper.c | 127 +- net/netfilter/nf_conntrack_irc.c | 36 +- net/netfilter/nf_conntrack_labels.c | 28 +- net/netfilter/nf_conntrack_netlink.c | 20 +- net/netfilter/nf_conntrack_sane.c | 57 +- net/netfilter/nf_conntrack_sip.c | 79 +- net/netfilter/nf_conntrack_standalone.c | 54 +- net/netfilter/nf_conntrack_tftp.c | 48 +- net/netfilter/nf_log.c | 33 +- net/netfilter/nf_nat_core.c | 154 +- net/netfilter/nf_tables_api.c | 414 +++--- net/netfilter/nf_tables_netdev.c | 1 - net/netfilter/nf_tables_trace.c | 2 +- net/netfilter/nfnetlink_acct.c | 23 +- net/netfilter/nfnetlink_cttimeout.c | 85 +- net/netfilter/nfnetlink_log.c | 10 +- net/netfilter/nfnetlink_queue.c | 6 +- net/netfilter/nft_compat.c | 75 +- net/netfilter/nft_ct.c | 41 +- net/netfilter/nft_dynset.c | 7 +- net/netfilter/nft_exthdr.c | 11 +- net/netfilter/nft_hash.c | 24 +- net/netfilter/nft_log.c | 51 +- net/netfilter/nft_lookup.c | 43 +- net/netfilter/nft_meta.c | 26 +- net/netfilter/nft_rbtree.c | 36 +- net/netfilter/nft_reject.c | 16 + net/netfilter/nft_reject_inet.c | 7 +- net/netfilter/x_tables.c | 53 + net/netfilter/xt_NFLOG.c | 3 + net/netfilter/xt_RATEEST.c | 2 +- net/netfilter/xt_TPROXY.c | 4 + net/netfilter/xt_TRACE.c | 25 +- net/netfilter/xt_connlabel.c | 29 +- net/netfilter/xt_nfacct.c | 2 +- net/netfilter/xt_owner.c | 41 +- net/netfilter/xt_physdev.c | 8 +- net/netfilter/xt_tcpudp.c | 7 +- net/netlabel/Kconfig | 1 + net/netlabel/Makefile | 2 +- net/netlabel/netlabel_calipso.c | 740 ++++++++++ net/netlabel/netlabel_calipso.h | 151 ++ net/netlabel/netlabel_domainhash.c | 293 +++- net/netlabel/netlabel_domainhash.h | 17 +- net/netlabel/netlabel_kapi.c | 382 ++++- net/netlabel/netlabel_mgmt.c | 85 +- net/netlabel/netlabel_mgmt.h | 27 +- net/netlabel/netlabel_unlabeled.c | 5 +- net/netlabel/netlabel_user.c | 5 + net/netlink/af_netlink.h | 14 - net/nfc/digital_core.c | 28 +- net/nfc/digital_dep.c | 316 ++-- net/nfc/digital_technology.c | 11 +- net/nfc/hci/llc.c | 17 +- net/nfc/llcp_commands.c | 23 +- net/nfc/llcp_core.c | 9 +- net/openvswitch/actions.c | 40 +- net/openvswitch/conntrack.c | 81 +- net/openvswitch/datapath.c | 42 +- net/openvswitch/datapath.h | 5 +- net/openvswitch/flow_netlink.c | 9 + net/openvswitch/vport-geneve.c | 9 +- net/openvswitch/vport-gre.c | 11 +- net/openvswitch/vport-internal_dev.c | 4 +- net/openvswitch/vport-vxlan.c | 9 +- net/openvswitch/vport.c | 1 + net/packet/af_packet.c | 42 +- net/rds/bind.c | 6 + net/rds/cong.c | 3 +- net/rds/connection.c | 329 +++-- net/rds/ib.c | 9 +- net/rds/ib.h | 8 +- net/rds/ib_cm.c | 9 +- net/rds/ib_rdma.c | 3 +- net/rds/ib_recv.c | 4 +- net/rds/ib_send.c | 4 +- net/rds/loop.c | 15 +- net/rds/message.c | 1 + net/rds/rdma_transport.c | 1 + net/rds/rds.h | 178 ++- net/rds/rds_single_path.h | 30 + net/rds/recv.c | 106 +- net/rds/send.c | 356 +++-- net/rds/tcp.c | 160 ++- net/rds/tcp.h | 23 +- net/rds/tcp_connect.c | 43 +- net/rds/tcp_listen.c | 76 +- net/rds/tcp_recv.c | 38 +- net/rds/tcp_send.c | 39 +- net/rds/threads.c | 105 +- net/rxrpc/Makefile | 37 +- net/rxrpc/af_rxrpc.c | 287 ++-- net/rxrpc/ar-internal.h | 516 ++++--- net/rxrpc/call_accept.c | 477 +++++++ net/rxrpc/call_event.c | 1302 +++++++++++++++++ net/rxrpc/call_object.c | 801 +++++++++++ net/rxrpc/conn_client.c | 372 +++++ net/rxrpc/conn_event.c | 394 +++++ net/rxrpc/conn_object.c | 340 +++++ net/rxrpc/conn_service.c | 230 +++ net/rxrpc/input.c | 757 ++++++++++ net/rxrpc/insecure.c | 7 +- net/rxrpc/key.c | 1237 ++++++++++++++++ net/rxrpc/local_event.c | 116 ++ net/rxrpc/local_object.c | 390 +++++ net/rxrpc/misc.c | 6 + net/rxrpc/output.c | 721 ++++++++++ net/rxrpc/peer_event.c | 281 ++++ net/rxrpc/peer_object.c | 315 ++++ net/rxrpc/proc.c | 192 +++ net/rxrpc/recvmsg.c | 417 ++++++ net/rxrpc/rxkad.c | 263 ++-- net/rxrpc/security.c | 168 +++ net/rxrpc/skbuff.c | 165 +++ net/rxrpc/sysctl.c | 12 +- net/rxrpc/utils.c | 46 + net/sched/Kconfig | 10 + net/sched/Makefile | 1 + net/sched/act_api.c | 306 ++-- net/sched/act_bpf.c | 41 +- net/sched/act_connmark.c | 30 +- net/sched/act_csum.c | 29 +- net/sched/act_gact.c | 31 +- net/sched/act_ife.c | 58 +- net/sched/act_ipt.c | 67 +- net/sched/act_mirred.c | 35 +- net/sched/act_nat.c | 29 +- net/sched/act_pedit.c | 36 +- net/sched/act_police.c | 105 +- net/sched/act_simple.c | 39 +- net/sched/act_skbedit.c | 62 +- net/sched/act_vlan.c | 41 +- net/sched/cls_api.c | 99 +- net/sched/cls_bpf.c | 7 +- net/sched/cls_flower.c | 65 +- net/sched/cls_matchall.c | 318 +++++ net/sched/sch_api.c | 30 +- net/sched/sch_atm.c | 33 +- net/sched/sch_blackhole.c | 5 +- net/sched/sch_cbq.c | 305 +--- net/sched/sch_choke.c | 41 +- net/sched/sch_codel.c | 10 +- net/sched/sch_drr.c | 38 +- net/sched/sch_dsmark.c | 27 +- net/sched/sch_fifo.c | 18 +- net/sched/sch_fq.c | 29 +- net/sched/sch_fq_codel.c | 64 +- net/sched/sch_generic.c | 99 +- net/sched/sch_gred.c | 42 +- net/sched/sch_hfsc.c | 108 +- net/sched/sch_hhf.c | 24 +- net/sched/sch_htb.c | 68 +- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 11 +- net/sched/sch_multiq.c | 32 +- net/sched/sch_netem.c | 73 +- net/sched/sch_pie.c | 7 +- net/sched/sch_plug.c | 19 +- net/sched/sch_prio.c | 27 +- net/sched/sch_qfq.c | 66 +- net/sched/sch_red.c | 28 +- net/sched/sch_sfb.c | 10 +- net/sched/sch_sfq.c | 11 +- net/sched/sch_tbf.c | 34 +- net/sched/sch_teql.c | 4 +- net/sctp/Makefile | 3 +- net/sctp/associola.c | 1 + net/sctp/chunk.c | 30 +- net/sctp/endpointola.c | 1 + net/sctp/input.c | 96 +- net/sctp/inqueue.c | 72 +- net/sctp/ipv6.c | 15 +- net/sctp/offload.c | 119 ++ net/sctp/output.c | 390 +++-- net/sctp/outqueue.c | 99 ++ net/sctp/protocol.c | 9 +- net/sctp/sctp_diag.c | 76 +- net/sctp/sm_make_chunk.c | 32 +- net/sctp/sm_sideeffect.c | 4 +- net/sctp/sm_statefuns.c | 9 +- net/sctp/socket.c | 302 +++- net/sctp/ulpevent.c | 17 +- net/sctp/ulpqueue.c | 4 +- net/sunrpc/auth.c | 8 +- net/sunrpc/auth_generic.c | 9 +- net/sunrpc/auth_gss/auth_gss.c | 3 + net/sunrpc/auth_gss/gss_krb5_mech.c | 2 + net/sunrpc/auth_gss/gss_mech_switch.c | 12 + net/sunrpc/auth_gss/svcauth_gss.c | 5 +- net/sunrpc/auth_null.c | 1 + net/sunrpc/auth_unix.c | 1 + net/sunrpc/cache.c | 2 +- net/sunrpc/clnt.c | 30 +- net/sunrpc/rpc_pipe.c | 8 +- net/sunrpc/sched.c | 67 +- net/sunrpc/svc.c | 8 +- net/sunrpc/svc_xprt.c | 51 +- net/sunrpc/svcsock.c | 150 +- net/sunrpc/xprt.c | 40 +- net/sunrpc/xprtrdma/Makefile | 2 +- net/sunrpc/xprtrdma/fmr_ops.c | 378 +++-- net/sunrpc/xprtrdma/frwr_ops.c | 318 ++--- net/sunrpc/xprtrdma/rpc_rdma.c | 274 ++-- net/sunrpc/xprtrdma/transport.c | 40 +- net/sunrpc/xprtrdma/verbs.c | 271 ++-- net/sunrpc/xprtrdma/xprt_rdma.h | 111 +- net/sunrpc/xprtsock.c | 177 ++- net/switchdev/switchdev.c | 5 +- net/tipc/Makefile | 2 +- net/tipc/addr.h | 6 +- net/tipc/bearer.c | 33 +- net/tipc/bearer.h | 3 +- net/tipc/core.c | 1 + net/tipc/core.h | 15 +- net/tipc/discover.c | 5 +- net/tipc/link.c | 51 +- net/tipc/monitor.c | 804 +++++++++++ net/tipc/monitor.h | 82 ++ net/tipc/name_distr.c | 8 +- net/tipc/netlink.c | 27 +- net/tipc/netlink.h | 1 + net/tipc/node.c | 213 ++- net/tipc/node.h | 5 + net/tipc/server.c | 3 +- net/tipc/udp_media.c | 29 +- net/unix/af_unix.c | 1 + net/vmw_vsock/Kconfig | 20 + net/vmw_vsock/Makefile | 6 + net/vmw_vsock/af_vsock.c | 31 +- net/vmw_vsock/virtio_transport.c | 620 ++++++++ net/vmw_vsock/virtio_transport_common.c | 992 +++++++++++++ net/vmw_vsock/vmci_transport.c | 2 + net/wireless/chan.c | 3 +- net/wireless/core.c | 34 +- net/wireless/core.h | 16 +- net/wireless/nl80211.c | 437 ++++-- net/wireless/nl80211.h | 2 +- net/wireless/scan.c | 18 +- net/wireless/sme.c | 8 +- net/wireless/trace.h | 33 +- net/xfrm/xfrm_input.c | 14 +- net/xfrm/xfrm_policy.c | 4 + net/xfrm/xfrm_state.c | 1 + net/xfrm/xfrm_user.c | 22 +- 509 files changed, 36609 insertions(+), 8783 deletions(-) create mode 100644 net/6lowpan/ndisc.c create mode 100644 net/batman-adv/bat_algo.c create mode 100644 net/batman-adv/bat_iv_ogm.h create mode 100644 net/batman-adv/bat_v.h create mode 100644 net/batman-adv/log.c create mode 100644 net/batman-adv/log.h create mode 100644 net/batman-adv/netlink.c create mode 100644 net/batman-adv/netlink.h create mode 100644 net/batman-adv/tp_meter.c create mode 100644 net/batman-adv/tp_meter.h create mode 100644 net/batman-adv/tvlv.c create mode 100644 net/batman-adv/tvlv.h create mode 100644 net/ceph/string_table.c create mode 100644 net/dsa/dsa2.c create mode 100644 net/ipv4/tcp_nv.c create mode 100644 net/ipv6/calipso.c create mode 100644 net/ncsi/Kconfig create mode 100644 net/ncsi/Makefile create mode 100644 net/ncsi/internal.h create mode 100644 net/ncsi/ncsi-aen.c create mode 100644 net/ncsi/ncsi-cmd.c create mode 100644 net/ncsi/ncsi-manage.c create mode 100644 net/ncsi/ncsi-pkt.h create mode 100644 net/ncsi/ncsi-rsp.c create mode 100644 net/netlabel/netlabel_calipso.c create mode 100644 net/netlabel/netlabel_calipso.h create mode 100644 net/rds/rds_single_path.h create mode 100644 net/rxrpc/call_accept.c create mode 100644 net/rxrpc/call_event.c create mode 100644 net/rxrpc/call_object.c create mode 100644 net/rxrpc/conn_client.c create mode 100644 net/rxrpc/conn_event.c create mode 100644 net/rxrpc/conn_object.c create mode 100644 net/rxrpc/conn_service.c create mode 100644 net/rxrpc/input.c create mode 100644 net/rxrpc/key.c create mode 100644 net/rxrpc/local_event.c create mode 100644 net/rxrpc/local_object.c create mode 100644 net/rxrpc/output.c create mode 100644 net/rxrpc/peer_event.c create mode 100644 net/rxrpc/peer_object.c create mode 100644 net/rxrpc/proc.c create mode 100644 net/rxrpc/recvmsg.c create mode 100644 net/rxrpc/security.c create mode 100644 net/rxrpc/skbuff.c create mode 100644 net/rxrpc/utils.c create mode 100644 net/sched/cls_matchall.c create mode 100644 net/sctp/offload.c create mode 100644 net/tipc/monitor.c create mode 100644 net/tipc/monitor.h create mode 100644 net/vmw_vsock/virtio_transport.c create mode 100644 net/vmw_vsock/virtio_transport_common.c (limited to 'net') diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h index 97ecc27ae..a67caee11 100644 --- a/net/6lowpan/6lowpan_i.h +++ b/net/6lowpan/6lowpan_i.h @@ -12,6 +12,10 @@ static inline bool lowpan_is_ll(const struct net_device *dev, return lowpan_dev(dev)->lltype == lltype; } +extern const struct ndisc_ops lowpan_ndisc_ops; + +int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev); + #ifdef CONFIG_6LOWPAN_DEBUGFS int lowpan_dev_debugfs_init(struct net_device *dev); void lowpan_dev_debugfs_exit(struct net_device *dev); diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile index e44f3bf2d..12d131ab2 100644 --- a/net/6lowpan/Makefile +++ b/net/6lowpan/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_6LOWPAN) += 6lowpan.o -6lowpan-y := core.o iphc.o nhc.o +6lowpan-y := core.o iphc.o nhc.o ndisc.o 6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o #rfc6282 nhcs diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 7a240b3ea..5945f7e19 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -14,6 +14,7 @@ #include #include +#include #include "6lowpan_i.h" @@ -33,6 +34,8 @@ int lowpan_register_netdevice(struct net_device *dev, for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) lowpan_dev(dev)->ctx.table[i].id = i; + dev->ndisc_ops = &lowpan_ndisc_ops; + ret = register_netdevice(dev); if (ret < 0) return ret; @@ -72,16 +75,61 @@ void lowpan_unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(lowpan_unregister_netdev); +int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) +{ + struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + /* Set short_addr autoconfiguration if short_addr is present only */ + if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) + return -1; + + /* For either address format, all zero addresses MUST NOT be used */ + if (wpan_dev->pan_id == cpu_to_le16(0x0000) && + wpan_dev->short_addr == cpu_to_le16(0x0000)) + return -1; + + /* Alternatively, if no PAN ID is known, 16 zero bits may be used */ + if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) + memset(eui, 0, 2); + else + ieee802154_le16_to_be16(eui, &wpan_dev->pan_id); + + /* The "Universal/Local" (U/L) bit shall be set to zero */ + eui[0] &= ~2; + eui[2] = 0; + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[5] = 0; + ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr); + return 0; +} + static int lowpan_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct inet6_dev *idev; + struct in6_addr addr; int i; if (dev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; + idev = __in6_dev_get(dev); + if (!idev) + return NOTIFY_DONE; + switch (event) { + case NETDEV_UP: + case NETDEV_CHANGE: + /* (802.15.4 6LoWPAN short address slaac handling */ + if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && + addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) { + __ipv6_addr_set_half(&addr.s6_addr32[0], + htonl(0xFE800000), 0); + addrconf_add_linklocal(idev, &addr, 0); + } + break; case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, @@ -112,8 +160,6 @@ static int __init lowpan_module_init(void) return ret; } - request_module_nowait("ipv6"); - request_module_nowait("nhc_dest"); request_module_nowait("nhc_fragment"); request_module_nowait("nhc_hop"); diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index acbaa3db4..24915e0bb 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -245,6 +245,41 @@ static const struct file_operations lowpan_context_fops = { .release = single_release, }; +static int lowpan_short_addr_get(void *data, u64 *val) +{ + struct wpan_dev *wdev = data; + + rtnl_lock(); + *val = le16_to_cpu(wdev->short_addr); + rtnl_unlock(); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get, + NULL, "0x%04llx\n"); + +static int lowpan_dev_debugfs_802154_init(const struct net_device *dev, + struct lowpan_dev *ldev) +{ + struct dentry *dentry, *root; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return 0; + + root = debugfs_create_dir("ieee802154", ldev->iface_debugfs); + if (!root) + return -EINVAL; + + dentry = debugfs_create_file("short_addr", 0444, root, + lowpan_802154_dev(dev)->wdev->ieee802154_ptr, + &lowpan_short_addr_fops); + if (!dentry) + return -EINVAL; + + return 0; +} + int lowpan_dev_debugfs_init(struct net_device *dev) { struct lowpan_dev *ldev = lowpan_dev(dev); @@ -272,6 +307,10 @@ int lowpan_dev_debugfs_init(struct net_device *dev) goto remove_root; } + ret = lowpan_dev_debugfs_802154_init(dev, ldev); + if (ret < 0) + goto remove_root; + return 0; remove_root: diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 8501dd532..79f1fa225 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -761,22 +761,75 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = { [LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11, }; -static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr, +static inline bool +lowpan_iphc_compress_ctx_802154_lladdr(const struct in6_addr *ipaddr, + const struct lowpan_iphc_ctx *ctx, + const void *lladdr) +{ + const struct ieee802154_addr *addr = lladdr; + unsigned char extended_addr[EUI64_ADDR_LEN]; + bool lladdr_compress = false; + struct in6_addr tmp = {}; + + switch (addr->mode) { + case IEEE802154_ADDR_LONG: + ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr); + /* check for SAM/DAM = 11 */ + memcpy(&tmp.s6_addr[8], &extended_addr, EUI64_ADDR_LEN); + /* second bit-flip (Universe/Local) is done according RFC2464 */ + tmp.s6_addr[8] ^= 0x02; + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) + lladdr_compress = true; + break; + case IEEE802154_ADDR_SHORT: + tmp.s6_addr[11] = 0xFF; + tmp.s6_addr[12] = 0xFE; + ieee802154_le16_to_be16(&tmp.s6_addr16[7], + &addr->short_addr); + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) + lladdr_compress = true; + break; + default: + /* should never handled and filtered by 802154 6lowpan */ + WARN_ON_ONCE(1); + break; + } + + return lladdr_compress; +} + +static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct net_device *dev, + const struct in6_addr *ipaddr, const struct lowpan_iphc_ctx *ctx, const unsigned char *lladdr, bool sam) { struct in6_addr tmp = {}; u8 dam; - /* check for SAM/DAM = 11 */ - memcpy(&tmp.s6_addr[8], lladdr, 8); - /* second bit-flip (Universe/Local) is done according RFC2464 */ - tmp.s6_addr[8] ^= 0x02; - /* context information are always used */ - ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); - if (ipv6_addr_equal(&tmp, ipaddr)) { - dam = LOWPAN_IPHC_DAM_11; - goto out; + switch (lowpan_dev(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_iphc_compress_ctx_802154_lladdr(ipaddr, ctx, + lladdr)) { + dam = LOWPAN_IPHC_DAM_11; + goto out; + } + break; + default: + /* check for SAM/DAM = 11 */ + memcpy(&tmp.s6_addr[8], lladdr, EUI64_ADDR_LEN); + /* second bit-flip (Universe/Local) is done according RFC2464 */ + tmp.s6_addr[8] ^= 0x02; + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) { + dam = LOWPAN_IPHC_DAM_11; + goto out; + } + break; } memset(&tmp, 0, sizeof(tmp)); @@ -813,28 +866,85 @@ out: return dam; } -static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr, +static inline bool +lowpan_iphc_compress_802154_lladdr(const struct in6_addr *ipaddr, + const void *lladdr) +{ + const struct ieee802154_addr *addr = lladdr; + unsigned char extended_addr[EUI64_ADDR_LEN]; + bool lladdr_compress = false; + struct in6_addr tmp = {}; + + switch (addr->mode) { + case IEEE802154_ADDR_LONG: + ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr); + if (is_addr_mac_addr_based(ipaddr, extended_addr)) + lladdr_compress = true; + break; + case IEEE802154_ADDR_SHORT: + /* fe:80::ff:fe00:XXXX + * \__/ + * short_addr + * + * Universe/Local bit is zero. + */ + tmp.s6_addr[0] = 0xFE; + tmp.s6_addr[1] = 0x80; + tmp.s6_addr[11] = 0xFF; + tmp.s6_addr[12] = 0xFE; + ieee802154_le16_to_be16(&tmp.s6_addr16[7], + &addr->short_addr); + if (ipv6_addr_equal(&tmp, ipaddr)) + lladdr_compress = true; + break; + default: + /* should never handled and filtered by 802154 6lowpan */ + WARN_ON_ONCE(1); + break; + } + + return lladdr_compress; +} + +static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct net_device *dev, + const struct in6_addr *ipaddr, const unsigned char *lladdr, bool sam) { - u8 dam = LOWPAN_IPHC_DAM_00; + u8 dam = LOWPAN_IPHC_DAM_01; - if (is_addr_mac_addr_based(ipaddr, lladdr)) { - dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ - pr_debug("address compression 0 bits\n"); - } else if (lowpan_is_iid_16_bit_compressable(ipaddr)) { + switch (lowpan_dev(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_iphc_compress_802154_lladdr(ipaddr, lladdr)) { + dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ + pr_debug("address compression 0 bits\n"); + goto out; + } + break; + default: + if (is_addr_mac_addr_based(ipaddr, lladdr)) { + dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ + pr_debug("address compression 0 bits\n"); + goto out; + } + break; + } + + if (lowpan_is_iid_16_bit_compressable(ipaddr)) { /* compress IID to 16 bits xxxx::XXXX */ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2); dam = LOWPAN_IPHC_DAM_10; /* 16-bits */ raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)", *hc_ptr - 2, 2); - } else { - /* do not compress IID => xxxx::IID */ - lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8); - dam = LOWPAN_IPHC_DAM_01; /* 64-bits */ - raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", - *hc_ptr - 8, 8); + goto out; } + /* do not compress IID => xxxx::IID */ + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8); + raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", + *hc_ptr - 8, 8); + +out: + if (sam) return lowpan_iphc_dam_to_sam_value[dam]; else @@ -1013,9 +1123,6 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, iphc0 = LOWPAN_DISPATCH_IPHC; iphc1 = 0; - raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN); - raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN); - raw_dump_table(__func__, "sending raw skb network uncompressed packet", skb->data, skb->len); @@ -1088,14 +1195,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, iphc1 |= LOWPAN_IPHC_SAC; } else { if (sci) { - iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr, + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev, + &hdr->saddr, &sci_entry, saddr, true); iphc1 |= LOWPAN_IPHC_SAC; } else { if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL && lowpan_is_linklocal_zero_padded(hdr->saddr)) { - iphc1 |= lowpan_compress_addr_64(&hc_ptr, + iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev, &hdr->saddr, saddr, true); pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n", @@ -1123,14 +1231,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, } } else { if (dci) { - iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr, + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev, + &hdr->daddr, &dci_entry, daddr, false); iphc1 |= LOWPAN_IPHC_DAC; } else { if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL && lowpan_is_linklocal_zero_padded(hdr->daddr)) { - iphc1 |= lowpan_compress_addr_64(&hc_ptr, + iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev, &hdr->daddr, daddr, false); pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n", diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c new file mode 100644 index 000000000..86450b7e2 --- /dev/null +++ b/net/6lowpan/ndisc.c @@ -0,0 +1,241 @@ +/* This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: + * (C) 2016 Pengutronix, Alexander Aring + */ + +#include +#include +#include + +#include "6lowpan_i.h" + +static int lowpan_ndisc_is_useropt(u8 nd_opt_type) +{ + return nd_opt_type == ND_OPT_6CO; +} + +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) +#define NDISC_802154_SHORT_ADDR_LENGTH 1 +static int lowpan_ndisc_parse_802154_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + switch (nd_opt->nd_opt_len) { + case NDISC_802154_SHORT_ADDR_LENGTH: + if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type]) + ND_PRINTK(2, warn, + "%s: duplicated short addr ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); + else + ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt; + return 1; + default: + /* all others will be handled by ndisc IPv6 option parsing */ + return 0; + } +} + +static int lowpan_ndisc_parse_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return 0; + + switch (nd_opt->nd_opt_type) { + case ND_OPT_SOURCE_LL_ADDR: + case ND_OPT_TARGET_LL_ADDR: + return lowpan_ndisc_parse_802154_options(dev, nd_opt, ndopts); + default: + return 0; + } +} + +static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, + u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); + u8 *lladdr_short = NULL; + + switch (icmp6_type) { + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + if (ndopts->nd_802154_opts_src_lladdr) { + lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr, + IEEE802154_SHORT_ADDR_LEN, 0); + if (!lladdr_short) { + ND_PRINTK(2, warn, + "NA: invalid short link-layer address length\n"); + return; + } + } + break; + case NDISC_REDIRECT: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + if (ndopts->nd_802154_opts_tgt_lladdr) { + lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr, + IEEE802154_SHORT_ADDR_LEN, 0); + if (!lladdr_short) { + ND_PRINTK(2, warn, + "NA: invalid short link-layer address length\n"); + return; + } + } + break; + default: + break; + } + + write_lock_bh(&n->lock); + if (lladdr_short) { + ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short); + if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr)) + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + } else { + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + } + write_unlock_bh(&n->lock); +} + +static void lowpan_ndisc_update(const struct net_device *dev, + struct neighbour *n, u32 flags, u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return; + + /* react on overrides only. TODO check if this is really right. */ + if (flags & NEIGH_UPDATE_F_OVERRIDE) + lowpan_ndisc_802154_update(n, flags, icmp6_type, ndopts); +} + +static int lowpan_ndisc_opt_addr_space(const struct net_device *dev, + u8 icmp6_type, struct neighbour *neigh, + u8 *ha_buf, u8 **ha) +{ + struct lowpan_802154_neigh *n; + struct wpan_dev *wpan_dev; + int addr_space = 0; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return 0; + + switch (icmp6_type) { + case NDISC_REDIRECT: + n = lowpan_802154_neigh(neighbour_priv(neigh)); + + read_lock_bh(&neigh->lock); + if (lowpan_802154_is_valid_src_short_addr(n->short_addr)) { + memcpy(ha_buf, &n->short_addr, + IEEE802154_SHORT_ADDR_LEN); + read_unlock_bh(&neigh->lock); + addr_space += __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0); + *ha = ha_buf; + } else { + read_unlock_bh(&neigh->lock); + } + break; + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_ROUTER_SOLICITATION: + wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) + addr_space = __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0); + break; + default: + break; + } + + return addr_space; +} + +static void lowpan_ndisc_fill_addr_option(const struct net_device *dev, + struct sk_buff *skb, u8 icmp6_type, + const u8 *ha) +{ + struct wpan_dev *wpan_dev; + __be16 short_addr; + u8 opt_type; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return; + + switch (icmp6_type) { + case NDISC_REDIRECT: + if (ha) { + ieee802154_le16_to_be16(&short_addr, ha); + __ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, + &short_addr, + IEEE802154_SHORT_ADDR_LEN, 0); + } + return; + case NDISC_NEIGHBOUR_ADVERTISEMENT: + opt_type = ND_OPT_TARGET_LL_ADDR; + break; + case NDISC_ROUTER_SOLICITATION: + case NDISC_NEIGHBOUR_SOLICITATION: + opt_type = ND_OPT_SOURCE_LL_ADDR; + break; + default: + return; + } + + wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) { + ieee802154_le16_to_be16(&short_addr, + &wpan_dev->short_addr); + __ndisc_fill_addr_option(skb, opt_type, &short_addr, + IEEE802154_SHORT_ADDR_LEN, 0); + } +} + +static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net, + struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + struct in6_addr *addr, + int addr_type, u32 addr_flags, + bool sllao, bool tokenized, + __u32 valid_lft, + u32 prefered_lft, + bool dev_addr_generated) +{ + int err; + + /* generates short based address for RA PIO's */ + if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && dev_addr_generated && + !addrconf_ifid_802154_6lowpan(addr->s6_addr + 8, dev)) { + err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, + addr, addr_type, addr_flags, + sllao, tokenized, valid_lft, + prefered_lft); + if (err) + ND_PRINTK(2, warn, + "RA: could not add a short address based address for prefix: %pI6c\n", + &pinfo->prefix); + } +} +#endif + +const struct ndisc_ops lowpan_ndisc_ops = { + .is_useropt = lowpan_ndisc_is_useropt, +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) + .parse_options = lowpan_ndisc_parse_options, + .update = lowpan_ndisc_update, + .opt_addr_space = lowpan_ndisc_opt_addr_space, + .fill_addr_option = lowpan_ndisc_fill_addr_option, + .prefix_rcv_add_addr = lowpan_ndisc_prefix_rcv_add_addr, +#endif +}; diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 82a116ba5..8de138d33 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -169,7 +169,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; - vlan->nest_level = dev_get_nest_level(real_dev, is_vlan_dev) + 1; + vlan->nest_level = dev_get_nest_level(real_dev) + 1; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 516b0e732..fbfacd51a 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -792,6 +792,8 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, + .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, + .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_fdb_add = switchdev_port_fdb_add, .ndo_fdb_del = switchdev_port_fdb_del, .ndo_fdb_dump = switchdev_port_fdb_dump, diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 4acb1d541..f24b25c25 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -507,8 +507,8 @@ err_out: /* wakeup anybody waiting for slots to pin pages */ wake_up(&vp_wq); } - kfree(in_pages); - kfree(out_pages); + kvfree(in_pages); + kvfree(out_pages); return err; } diff --git a/net/Kconfig b/net/Kconfig index ff40562a7..c2cdbce62 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -237,6 +237,7 @@ source "net/hsr/Kconfig" source "net/switchdev/Kconfig" source "net/l3mdev/Kconfig" source "net/qrtr/Kconfig" +source "net/ncsi/Kconfig" config RPS bool diff --git a/net/Makefile b/net/Makefile index bdd14553a..9bd20bb86 100644 --- a/net/Makefile +++ b/net/Makefile @@ -79,3 +79,4 @@ ifneq ($(CONFIG_NET_L3_MASTER_DEV),) obj-y += l3mdev/ endif obj-$(CONFIG_QRTR) += qrtr/ +obj-$(CONFIG_NET_NCSI) += ncsi/ diff --git a/net/atm/clip.c b/net/atm/clip.c index e07f551a8..53b4ac09e 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -286,7 +286,7 @@ static const struct neigh_ops clip_neigh_ops = { .connected_output = neigh_direct_output, }; -static int clip_constructor(struct neighbour *neigh) +static int clip_constructor(struct net_device *dev, struct neighbour *neigh) { struct atmarp_entry *entry = neighbour_priv(neigh); diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index f66930ee3..833bb145b 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -66,7 +66,7 @@ config BATMAN_ADV_NC config BATMAN_ADV_MCAST bool "Multicast optimisation" - depends on BATMAN_ADV + depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) default n help This option enables the multicast optimisation which aims to diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 797cf2fc8..a83fc6c58 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -17,6 +17,7 @@ # obj-$(CONFIG_BATMAN_ADV) += batman-adv.o +batman-adv-y += bat_algo.o batman-adv-y += bat_iv_ogm.o batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v.o batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o @@ -31,12 +32,16 @@ batman-adv-y += gateway_common.o batman-adv-y += hard-interface.o batman-adv-y += hash.o batman-adv-y += icmp_socket.o +batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o batman-adv-y += main.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o +batman-adv-y += netlink.o batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += routing.o batman-adv-y += send.o batman-adv-y += soft-interface.o batman-adv-y += sysfs.o +batman-adv-y += tp_meter.o batman-adv-y += translation-table.o +batman-adv-y += tvlv.o diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c new file mode 100644 index 000000000..81dbbf569 --- /dev/null +++ b/net/batman-adv/bat_algo.c @@ -0,0 +1,140 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "bat_algo.h" + +char batadv_routing_algo[20] = "BATMAN_IV"; +static struct hlist_head batadv_algo_list; + +/** + * batadv_algo_init - Initialize batman-adv algorithm management data structures + */ +void batadv_algo_init(void) +{ + INIT_HLIST_HEAD(&batadv_algo_list); +} + +static struct batadv_algo_ops *batadv_algo_get(char *name) +{ + struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; + + hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) { + if (strcmp(bat_algo_ops_tmp->name, name) != 0) + continue; + + bat_algo_ops = bat_algo_ops_tmp; + break; + } + + return bat_algo_ops; +} + +int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) +{ + struct batadv_algo_ops *bat_algo_ops_tmp; + + bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); + if (bat_algo_ops_tmp) { + pr_info("Trying to register already registered routing algorithm: %s\n", + bat_algo_ops->name); + return -EEXIST; + } + + /* all algorithms must implement all ops (for now) */ + if (!bat_algo_ops->iface.enable || + !bat_algo_ops->iface.disable || + !bat_algo_ops->iface.update_mac || + !bat_algo_ops->iface.primary_set || + !bat_algo_ops->neigh.cmp || + !bat_algo_ops->neigh.is_similar_or_better) { + pr_info("Routing algo '%s' does not implement required ops\n", + bat_algo_ops->name); + return -EINVAL; + } + + INIT_HLIST_NODE(&bat_algo_ops->list); + hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); + + return 0; +} + +int batadv_algo_select(struct batadv_priv *bat_priv, char *name) +{ + struct batadv_algo_ops *bat_algo_ops; + + bat_algo_ops = batadv_algo_get(name); + if (!bat_algo_ops) + return -EINVAL; + + bat_priv->algo_ops = bat_algo_ops; + + return 0; +} + +int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) +{ + struct batadv_algo_ops *bat_algo_ops; + + seq_puts(seq, "Available routing algorithms:\n"); + + hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { + seq_printf(seq, " * %s\n", bat_algo_ops->name); + } + + return 0; +} + +static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) +{ + struct batadv_algo_ops *bat_algo_ops; + char *algo_name = (char *)val; + size_t name_len = strlen(algo_name); + + if (name_len > 0 && algo_name[name_len - 1] == '\n') + algo_name[name_len - 1] = '\0'; + + bat_algo_ops = batadv_algo_get(algo_name); + if (!bat_algo_ops) { + pr_err("Routing algorithm '%s' is not supported\n", algo_name); + return -EINVAL; + } + + return param_set_copystring(algo_name, kp); +} + +static const struct kernel_param_ops batadv_param_ops_ra = { + .set = batadv_param_set_ra, + .get = param_get_string, +}; + +static struct kparam_string batadv_param_string_ra = { + .maxlen = sizeof(batadv_routing_algo), + .string = batadv_routing_algo, +}; + +module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, + 0644); diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 03dafd33d..860d773dd 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -18,32 +18,18 @@ #ifndef _NET_BATMAN_ADV_BAT_ALGO_H_ #define _NET_BATMAN_ADV_BAT_ALGO_H_ -struct batadv_priv; +#include "main.h" -int batadv_iv_init(void); +#include -#ifdef CONFIG_BATMAN_ADV_BATMAN_V +struct seq_file; -int batadv_v_init(void); -int batadv_v_mesh_init(struct batadv_priv *bat_priv); -void batadv_v_mesh_free(struct batadv_priv *bat_priv); +extern char batadv_routing_algo[]; +extern struct list_head batadv_hardif_list; -#else - -static inline int batadv_v_init(void) -{ - return 0; -} - -static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv) -{ - return 0; -} - -static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv) -{ -} - -#endif /* CONFIG_BATMAN_ADV_BATMAN_V */ +void batadv_algo_init(void); +int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); +int batadv_algo_select(struct batadv_priv *bat_priv, char *name); +int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); #endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index ce2f20304..19b0abd6c 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -15,7 +15,7 @@ * along with this program; if not, see . */ -#include "bat_algo.h" +#include "bat_iv_ogm.h" #include "main.h" #include @@ -30,8 +30,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -48,15 +49,20 @@ #include #include +#include "bat_algo.h" #include "bitarray.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "network-coding.h" #include "originator.h" #include "packet.h" #include "routing.h" #include "send.h" #include "translation-table.h" +#include "tvlv.h" + +static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work); /** * enum batadv_dup_status - duplicate status @@ -336,7 +342,8 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, { struct batadv_neigh_node *neigh_node; - neigh_node = batadv_neigh_node_new(orig_node, hard_iface, neigh_addr); + neigh_node = batadv_neigh_node_get_or_create(orig_node, + hard_iface, neigh_addr); if (!neigh_node) goto out; @@ -730,7 +737,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, /* start timer for this packet */ INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work, - batadv_send_outstanding_bat_ogm_packet); + batadv_iv_send_outstanding_bat_ogm_packet); queue_delayed_work(batadv_event_workqueue, &forw_packet_aggr->delayed_work, send_time - jiffies); @@ -937,6 +944,19 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) u16 tvlv_len = 0; unsigned long send_time; + if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || + (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + return; + + /* the interface gets activated here to avoid race conditions between + * the moment of activating the interface in + * hardif_activate_interface() where the originator mac is set and + * outdated packets (especially uninitialized mac addresses) in the + * packet queue + */ + if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) + hard_iface->if_status = BATADV_IF_ACTIVE; + primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { @@ -1778,6 +1798,45 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, batadv_orig_node_put(orig_node); } +static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work) +{ + struct delayed_work *delayed_work; + struct batadv_forw_packet *forw_packet; + struct batadv_priv *bat_priv; + + delayed_work = to_delayed_work(work); + forw_packet = container_of(delayed_work, struct batadv_forw_packet, + delayed_work); + bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); + spin_lock_bh(&bat_priv->forw_bat_list_lock); + hlist_del(&forw_packet->list); + spin_unlock_bh(&bat_priv->forw_bat_list_lock); + + if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) + goto out; + + batadv_iv_ogm_emit(forw_packet); + + /* we have to have at least one packet in the queue to determine the + * queues wake up time unless we are shutting down. + * + * only re-schedule if this is the "original" copy, e.g. the OGM of the + * primary interface should only be rescheduled once per period, but + * this function will be called for the forw_packet instances of the + * other secondary interfaces as well. + */ + if (forw_packet->own && + forw_packet->if_incoming == forw_packet->if_outgoing) + batadv_iv_ogm_schedule(forw_packet->if_incoming); + +out: + /* don't count own packet */ + if (!forw_packet->own) + atomic_inc(&bat_priv->batman_queue_left); + + batadv_forw_packet_free(forw_packet); +} + static int batadv_iv_ogm_receive(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { @@ -1794,7 +1853,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface * that does not have B.A.T.M.A.N. IV enabled ? */ - if (bat_priv->bat_algo_ops->bat_ogm_emit != batadv_iv_ogm_emit) + if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable) return NET_RX_DROP; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); @@ -2052,21 +2111,32 @@ out: return ret; } +static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) +{ + /* begin scheduling originator messages on that interface */ + batadv_iv_ogm_schedule(hard_iface); +} + static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", - .bat_iface_enable = batadv_iv_ogm_iface_enable, - .bat_iface_disable = batadv_iv_ogm_iface_disable, - .bat_iface_update_mac = batadv_iv_ogm_iface_update_mac, - .bat_primary_iface_set = batadv_iv_ogm_primary_iface_set, - .bat_ogm_schedule = batadv_iv_ogm_schedule, - .bat_ogm_emit = batadv_iv_ogm_emit, - .bat_neigh_cmp = batadv_iv_ogm_neigh_cmp, - .bat_neigh_is_similar_or_better = batadv_iv_ogm_neigh_is_sob, - .bat_neigh_print = batadv_iv_neigh_print, - .bat_orig_print = batadv_iv_ogm_orig_print, - .bat_orig_free = batadv_iv_ogm_orig_free, - .bat_orig_add_if = batadv_iv_ogm_orig_add_if, - .bat_orig_del_if = batadv_iv_ogm_orig_del_if, + .iface = { + .activate = batadv_iv_iface_activate, + .enable = batadv_iv_ogm_iface_enable, + .disable = batadv_iv_ogm_iface_disable, + .update_mac = batadv_iv_ogm_iface_update_mac, + .primary_set = batadv_iv_ogm_primary_iface_set, + }, + .neigh = { + .cmp = batadv_iv_ogm_neigh_cmp, + .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, + .print = batadv_iv_neigh_print, + }, + .orig = { + .print = batadv_iv_ogm_orig_print, + .free = batadv_iv_ogm_orig_free, + .add_if = batadv_iv_ogm_orig_add_if, + .del_if = batadv_iv_ogm_orig_del_if, + }, }; int __init batadv_iv_init(void) diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h new file mode 100644 index 000000000..b9f3550fa --- /dev/null +++ b/net/batman-adv/bat_iv_ogm.h @@ -0,0 +1,25 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_ +#define _BATMAN_ADV_BATADV_IV_OGM_H_ + +#include "main.h" + +int batadv_iv_init(void); + +#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */ diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 0a12e5cdd..0366cbf5e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -15,7 +15,7 @@ * along with this program; if not, see . */ -#include "bat_algo.h" +#include "bat_v.h" #include "main.h" #include @@ -31,6 +31,7 @@ #include #include +#include "bat_algo.h" #include "bat_v_elp.h" #include "bat_v_ogm.h" #include "hard-interface.h" @@ -70,11 +71,6 @@ static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) if (ret < 0) batadv_v_elp_iface_disable(hard_iface); - /* enable link throughput auto-detection by setting the throughput - * override to zero - */ - atomic_set(&hard_iface->bat_v.throughput_override, 0); - return ret; } @@ -119,14 +115,6 @@ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) batadv_v_elp_throughput_metric_update); } -static void batadv_v_ogm_schedule(struct batadv_hard_iface *hard_iface) -{ -} - -static void batadv_v_ogm_emit(struct batadv_forw_packet *forw_packet) -{ -} - /** * batadv_v_orig_print_neigh - print neighbors for the originator table * @orig_node: the orig_node for which the neighbors are printed @@ -334,20 +322,38 @@ err_ifinfo1: static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", - .bat_iface_activate = batadv_v_iface_activate, - .bat_iface_enable = batadv_v_iface_enable, - .bat_iface_disable = batadv_v_iface_disable, - .bat_iface_update_mac = batadv_v_iface_update_mac, - .bat_primary_iface_set = batadv_v_primary_iface_set, - .bat_hardif_neigh_init = batadv_v_hardif_neigh_init, - .bat_ogm_emit = batadv_v_ogm_emit, - .bat_ogm_schedule = batadv_v_ogm_schedule, - .bat_orig_print = batadv_v_orig_print, - .bat_neigh_cmp = batadv_v_neigh_cmp, - .bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob, - .bat_neigh_print = batadv_v_neigh_print, + .iface = { + .activate = batadv_v_iface_activate, + .enable = batadv_v_iface_enable, + .disable = batadv_v_iface_disable, + .update_mac = batadv_v_iface_update_mac, + .primary_set = batadv_v_primary_iface_set, + }, + .neigh = { + .hardif_init = batadv_v_hardif_neigh_init, + .cmp = batadv_v_neigh_cmp, + .is_similar_or_better = batadv_v_neigh_is_sob, + .print = batadv_v_neigh_print, + }, + .orig = { + .print = batadv_v_orig_print, + }, }; +/** + * batadv_v_hardif_init - initialize the algorithm specific fields in the + * hard-interface object + * @hard_iface: the hard-interface to initialize + */ +void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) +{ + /* enable link throughput auto-detection by setting the throughput + * override to zero + */ + atomic_set(&hard_iface->bat_v.throughput_override, 0); + atomic_set(&hard_iface->bat_v.elp_interval, 500); +} + /** * batadv_v_mesh_init - initialize the B.A.T.M.A.N. V private resources for a * mesh diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h new file mode 100644 index 000000000..83b776397 --- /dev/null +++ b/net/batman-adv/bat_v.h @@ -0,0 +1,52 @@ +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Linus Lüssing + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_BAT_V_H_ +#define _NET_BATMAN_ADV_BAT_V_H_ + +#include "main.h" + +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + +int batadv_v_init(void); +void batadv_v_hardif_init(struct batadv_hard_iface *hardif); +int batadv_v_mesh_init(struct batadv_priv *bat_priv); +void batadv_v_mesh_free(struct batadv_priv *bat_priv); + +#else + +static inline int batadv_v_init(void) +{ + return 0; +} + +static inline void batadv_v_hardif_init(struct batadv_hard_iface *hardif) +{ +} + +static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv) +{ +} + +#endif /* CONFIG_BATMAN_ADV_BATMAN_V */ + +#endif /* _NET_BATMAN_ADV_BAT_V_H_ */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index df42eb136..ee08540ce 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -43,6 +43,7 @@ #include "bat_algo.h" #include "bat_v_ogm.h" #include "hard-interface.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "routing.h" @@ -334,7 +335,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) goto out; skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); - elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_buff = skb_put(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); elp_packet = (struct batadv_elp_packet *)elp_buff; memset(elp_packet, 0, BATADV_ELP_HLEN); @@ -344,7 +345,6 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&hard_iface->bat_v.elp_seqno, random_seqno); - atomic_set(&hard_iface->bat_v.elp_interval, 500); /* assume full-duplex by default */ hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; @@ -443,7 +443,8 @@ static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv, if (!orig_neigh) return; - neigh = batadv_neigh_node_new(orig_neigh, if_incoming, neigh_addr); + neigh = batadv_neigh_node_get_or_create(orig_neigh, + if_incoming, neigh_addr); if (!neigh) goto orig_free; @@ -503,7 +504,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb, /* did we receive a B.A.T.M.A.N. V ELP packet on an interface * that does not have B.A.T.M.A.N. V ELP enabled ? */ - if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0) + if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) return NET_RX_DROP; elp_packet = (struct batadv_elp_packet *)skb->data; diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index cc130b2d0..be17c0b13 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -15,11 +15,11 @@ * along with this program; if not, see . */ -#include "main.h" - #ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_ #define _NET_BATMAN_ADV_BAT_V_ELP_H_ +#include "main.h" + struct sk_buff; struct work_struct; diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 473ebb9a0..6fbba4eb0 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -39,13 +39,16 @@ #include #include +#include "bat_algo.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "routing.h" #include "send.h" #include "translation-table.h" +#include "tvlv.h" /** * batadv_v_ogm_orig_get - retrieve and possibly create an originator node @@ -683,8 +686,8 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, if (!orig_node) return; - neigh_node = batadv_neigh_node_new(orig_node, if_incoming, - ethhdr->h_source); + neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming, + ethhdr->h_source); if (!neigh_node) goto out; @@ -751,7 +754,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb, /* did we receive a OGM2 packet on an interface that does not have * B.A.T.M.A.N. V enabled ? */ - if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0) + if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) return NET_RX_DROP; if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index d849c75ad..4c4d45caa 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -18,10 +18,10 @@ #ifndef _BATMAN_ADV_BATADV_V_OGM_H_ #define _BATMAN_ADV_BATADV_V_OGM_H_ +#include "main.h" + #include -struct batadv_hard_iface; -struct batadv_priv; struct sk_buff; int batadv_v_ogm_init(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index a0c791383..032271421 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -20,6 +20,8 @@ #include +#include "log.h" + /* shift the packet array by n places. */ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) { diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 825a5cdf4..ad2ffe16d 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -48,6 +48,7 @@ #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "sysfs.h" diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 952900466..1d68b6e63 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -18,245 +18,33 @@ #include "debugfs.h" #include "main.h" -#include #include #include #include #include -#include #include -#include -#include -#include #include -#include #include #include /* for linux/wait.h */ #include -#include -#include #include #include #include #include -#include -#include -#include -#include +#include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "icmp_socket.h" +#include "log.h" +#include "multicast.h" #include "network-coding.h" #include "originator.h" #include "translation-table.h" static struct dentry *batadv_debugfs; -#ifdef CONFIG_BATMAN_ADV_DEBUG -#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) - -static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; - -static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log, - size_t idx) -{ - return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK]; -} - -static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log, - char c) -{ - char *char_addr; - - char_addr = batadv_log_char_addr(debug_log, debug_log->log_end); - *char_addr = c; - debug_log->log_end++; - - if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len) - debug_log->log_start = debug_log->log_end - batadv_log_buff_len; -} - -__printf(2, 3) -static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, - const char *fmt, ...) -{ - va_list args; - static char debug_log_buf[256]; - char *p; - - if (!debug_log) - return 0; - - spin_lock_bh(&debug_log->lock); - va_start(args, fmt); - vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args); - va_end(args); - - for (p = debug_log_buf; *p != 0; p++) - batadv_emit_log_char(debug_log, *p); - - spin_unlock_bh(&debug_log->lock); - - wake_up(&debug_log->queue_wait); - - return 0; -} - -int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) -{ - va_list args; - char tmp_log_buf[256]; - - va_start(args, fmt); - vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args); - batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s", - jiffies_to_msecs(jiffies), tmp_log_buf); - va_end(args); - - return 0; -} - -static int batadv_log_open(struct inode *inode, struct file *file) -{ - if (!try_module_get(THIS_MODULE)) - return -EBUSY; - - nonseekable_open(inode, file); - file->private_data = inode->i_private; - return 0; -} - -static int batadv_log_release(struct inode *inode, struct file *file) -{ - module_put(THIS_MODULE); - return 0; -} - -static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log) -{ - return !(debug_log->log_start - debug_log->log_end); -} - -static ssize_t batadv_log_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct batadv_priv *bat_priv = file->private_data; - struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; - int error, i = 0; - char *char_addr; - char c; - - if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log)) - return -EAGAIN; - - if (!buf) - return -EINVAL; - - if (count == 0) - return 0; - - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - - error = wait_event_interruptible(debug_log->queue_wait, - (!batadv_log_empty(debug_log))); - - if (error) - return error; - - spin_lock_bh(&debug_log->lock); - - while ((!error) && (i < count) && - (debug_log->log_start != debug_log->log_end)) { - char_addr = batadv_log_char_addr(debug_log, - debug_log->log_start); - c = *char_addr; - - debug_log->log_start++; - - spin_unlock_bh(&debug_log->lock); - - error = __put_user(c, buf); - - spin_lock_bh(&debug_log->lock); - - buf++; - i++; - } - - spin_unlock_bh(&debug_log->lock); - - if (!error) - return i; - - return error; -} - -static unsigned int batadv_log_poll(struct file *file, poll_table *wait) -{ - struct batadv_priv *bat_priv = file->private_data; - struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; - - poll_wait(file, &debug_log->queue_wait, wait); - - if (!batadv_log_empty(debug_log)) - return POLLIN | POLLRDNORM; - - return 0; -} - -static const struct file_operations batadv_log_fops = { - .open = batadv_log_open, - .release = batadv_log_release, - .read = batadv_log_read, - .poll = batadv_log_poll, - .llseek = no_llseek, -}; - -static int batadv_debug_log_setup(struct batadv_priv *bat_priv) -{ - struct dentry *d; - - if (!bat_priv->debug_dir) - goto err; - - bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC); - if (!bat_priv->debug_log) - goto err; - - spin_lock_init(&bat_priv->debug_log->lock); - init_waitqueue_head(&bat_priv->debug_log->queue_wait); - - d = debugfs_create_file("log", S_IFREG | S_IRUSR, - bat_priv->debug_dir, bat_priv, - &batadv_log_fops); - if (!d) - goto err; - - return 0; - -err: - return -ENOMEM; -} - -static void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) -{ - kfree(bat_priv->debug_log); - bat_priv->debug_log = NULL; -} -#else /* CONFIG_BATMAN_ADV_DEBUG */ -static int batadv_debug_log_setup(struct batadv_priv *bat_priv) -{ - return 0; -} - -static void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) -{ -} -#endif - static int batadv_algorithms_open(struct inode *inode, struct file *file) { return single_open(file, batadv_algo_seq_print_text, NULL); @@ -363,6 +151,22 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file) } #endif +#ifdef CONFIG_BATMAN_ADV_MCAST +/** + * batadv_mcast_flags_open - prepare file handler for reads from mcast_flags + * @inode: inode which was opened + * @file: file handle to be initialized + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_mcast_flags_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + + return single_open(file, batadv_mcast_flags_seq_print_text, net_dev); +} +#endif + #define BATADV_DEBUGINFO(_name, _mode, _open) \ struct batadv_debuginfo batadv_debuginfo_##_name = { \ .attr = { \ @@ -407,6 +211,9 @@ static BATADV_DEBUGINFO(transtable_local, S_IRUGO, #ifdef CONFIG_BATMAN_ADV_NC static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open); #endif +#ifdef CONFIG_BATMAN_ADV_MCAST +static BATADV_DEBUGINFO(mcast_flags, S_IRUGO, batadv_mcast_flags_open); +#endif static struct batadv_debuginfo *batadv_mesh_debuginfos[] = { &batadv_debuginfo_neighbors, @@ -423,6 +230,9 @@ static struct batadv_debuginfo *batadv_mesh_debuginfos[] = { &batadv_debuginfo_transtable_local, #ifdef CONFIG_BATMAN_ADV_NC &batadv_debuginfo_nc_nodes, +#endif +#ifdef CONFIG_BATMAN_ADV_MCAST + &batadv_debuginfo_mcast_flags, #endif NULL, }; diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index aee3b3991..b1cc8bfe1 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -45,9 +45,11 @@ #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "originator.h" #include "send.h" #include "translation-table.h" +#include "tvlv.h" static void batadv_dat_purge(struct work_struct *work); diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 65536db1b..0934730fb 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -414,7 +413,7 @@ static struct sk_buff *batadv_frag_create(struct sk_buff *skb, if (!skb_fragment) goto err; - skb->priority = TC_PRIO_CONTROL; + skb_fragment->priority = skb->priority; /* Eat the last mtu-bytes of the skb */ skb_reserve(skb_fragment, header_size + ETH_HLEN); @@ -434,11 +433,12 @@ err: * @orig_node: final destination of the created fragments * @neigh_node: next-hop of the created fragments * - * Return: true on success, false otherwise. + * Return: the netdev tx status or -1 in case of error. + * When -1 is returned the skb is not consumed. */ -bool batadv_frag_send_packet(struct sk_buff *skb, - struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node) +int batadv_frag_send_packet(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node) { struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; @@ -447,7 +447,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb, unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; unsigned int header_size = sizeof(frag_header); unsigned int max_fragment_size, max_packet_size; - bool ret = false; + int ret = -1; /* To avoid merge and refragmentation at next-hops we never send * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE @@ -458,12 +458,12 @@ bool batadv_frag_send_packet(struct sk_buff *skb, /* Don't even try to fragment, if we need more than 16 fragments */ if (skb->len > max_packet_size) - goto out_err; + goto out; bat_priv = orig_node->bat_priv; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) - goto out_err; + goto out; /* Create one header to be copied to all fragments */ frag_header.packet_type = BATADV_UNICAST_FRAG; @@ -473,6 +473,15 @@ bool batadv_frag_send_packet(struct sk_buff *skb, frag_header.reserved = 0; frag_header.no = 0; frag_header.total_size = htons(skb->len); + + /* skb->priority values from 256->263 are magic values to + * directly indicate a specific 802.1d priority. This is used + * to allow 802.1d priority to be passed directly in from VLAN + * tags, etc. + */ + if (skb->priority >= 256 && skb->priority <= 263) + frag_header.priority = skb->priority - 256; + ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr); ether_addr_copy(frag_header.dest, orig_node->orig); @@ -480,23 +489,33 @@ bool batadv_frag_send_packet(struct sk_buff *skb, while (skb->len > max_fragment_size) { skb_fragment = batadv_frag_create(skb, &frag_header, mtu); if (!skb_fragment) - goto out_err; + goto out; batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, skb_fragment->len + ETH_HLEN); - batadv_send_unicast_skb(skb_fragment, neigh_node); + ret = batadv_send_unicast_skb(skb_fragment, neigh_node); + if (ret != NET_XMIT_SUCCESS) { + /* return -1 so that the caller can free the original + * skb + */ + ret = -1; + goto out; + } + frag_header.no++; /* The initial check in this function should cover this case */ - if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) - goto out_err; + if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) { + ret = -1; + goto out; + } } /* Make room for the fragment header. */ if (batadv_skb_head_push(skb, header_size) < 0 || pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) - goto out_err; + goto out; memcpy(skb->data, &frag_header, header_size); @@ -504,11 +523,9 @@ bool batadv_frag_send_packet(struct sk_buff *skb, batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, skb->len + ETH_HLEN); - batadv_send_unicast_skb(skb, neigh_node); - - ret = true; + ret = batadv_send_unicast_skb(skb, neigh_node); -out_err: +out: if (primary_if) batadv_hardif_put(primary_if); diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 9ff77c7ef..3202fe329 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -34,9 +34,9 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, struct batadv_orig_node *orig_node_src); bool batadv_frag_skb_buffer(struct sk_buff **skb, struct batadv_orig_node *orig_node); -bool batadv_frag_send_packet(struct sk_buff *skb, - struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node); +int batadv_frag_send_packet(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node); /** * batadv_frag_check_entry - check if a list of fragments has timed out diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 5839c569f..63a805d3f 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -42,6 +42,7 @@ #include "gateway_common.h" #include "hard-interface.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "routing.h" @@ -192,7 +193,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) tq_avg = router_ifinfo->bat_iv.tq_avg; - switch (atomic_read(&bat_priv->gw_sel_class)) { + switch (atomic_read(&bat_priv->gw.sel_class)) { case 1: /* fast connection */ tmp_gw_factor = tq_avg * tq_avg; tmp_gw_factor *= gw_node->bandwidth_down; @@ -255,7 +256,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw; - if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT) + if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) return; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); @@ -283,7 +284,7 @@ void batadv_gw_election(struct batadv_priv *bat_priv) struct batadv_neigh_ifinfo *router_ifinfo = NULL; char gw_addr[18] = { '\0' }; - if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT) + if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); @@ -402,8 +403,8 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv, /* if the routing class is greater than 3 the value tells us how much * greater the TQ value of the new gateway must be */ - if ((atomic_read(&bat_priv->gw_sel_class) > 3) && - (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw_sel_class))) + if ((atomic_read(&bat_priv->gw.sel_class) > 3) && + (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -638,8 +639,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) goto out; seq_printf(seq, - " %-12s (%s/%i) %17s [%10s]: advertised uplink bandwidth ... [B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s)]\n", - "Gateway", "#", BATADV_TQ_MAX_VALUE, "Nexthop", "outgoingIF", + " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth ... [B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s)]\n", BATADV_SOURCE_VERSION, primary_if->net_dev->name, primary_if->net_dev->dev_addr, net_dev->name); @@ -821,7 +821,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if (!gw_node) goto out; - switch (atomic_read(&bat_priv->gw_mode)) { + switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_SERVER: /* If we are a GW then we are our best GW. We can artificially * set the tq towards ourself as the maximum value diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 442304788..d7bc6a87b 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -19,8 +19,8 @@ #include "main.h" #include -#include #include +#include #include #include #include @@ -28,7 +28,9 @@ #include #include "gateway_client.h" +#include "log.h" #include "packet.h" +#include "tvlv.h" /** * batadv_parse_throughput - parse supplied string buffer to extract throughput @@ -144,7 +146,7 @@ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) u32 down, up; char gw_mode; - gw_mode = atomic_read(&bat_priv->gw_mode); + gw_mode = atomic_read(&bat_priv->gw.mode); switch (gw_mode) { case BATADV_GW_MODE_OFF: @@ -241,8 +243,8 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /* restart gateway selection if fast or late switching was enabled */ if ((gateway.bandwidth_down != 0) && - (atomic_read(&bat_priv->gw_mode) == BATADV_GW_MODE_CLIENT) && - (atomic_read(&bat_priv->gw_sel_class) > 2)) + (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) && + (atomic_read(&bat_priv->gw.sel_class) > 2)) batadv_gw_check_election(bat_priv, orig); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 8c2f39962..1f9080840 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -23,9 +23,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -37,10 +37,12 @@ #include #include +#include "bat_v.h" #include "bridge_loop_avoidance.h" #include "debugfs.h" #include "distributed-arp-table.h" #include "gateway_client.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "send.h" @@ -245,7 +247,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, if (!new_hard_iface) goto out; - bat_priv->bat_algo_ops->bat_primary_iface_set(new_hard_iface); + bat_priv->algo_ops->iface.primary_set(new_hard_iface); batadv_primary_if_update_addr(bat_priv, curr_hard_iface); out: @@ -392,7 +394,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) bat_priv = netdev_priv(hard_iface->soft_iface); - bat_priv->bat_algo_ops->bat_iface_update_mac(hard_iface); + bat_priv->algo_ops->iface.update_mac(hard_iface); hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED; /* the first active interface becomes our primary interface or @@ -407,8 +409,8 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) batadv_update_min_mtu(hard_iface->soft_iface); - if (bat_priv->bat_algo_ops->bat_iface_activate) - bat_priv->bat_algo_ops->bat_iface_activate(hard_iface); + if (bat_priv->algo_ops->iface.activate) + bat_priv->algo_ops->iface.activate(hard_iface); out: if (primary_if) @@ -506,7 +508,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (ret) goto err_dev; - ret = bat_priv->bat_algo_ops->bat_iface_enable(hard_iface); + ret = bat_priv->algo_ops->iface.enable(hard_iface); if (ret < 0) goto err_upper; @@ -515,7 +517,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->if_status = BATADV_IF_INACTIVE; ret = batadv_orig_hash_add_if(hard_iface, bat_priv->num_ifaces); if (ret < 0) { - bat_priv->bat_algo_ops->bat_iface_disable(hard_iface); + bat_priv->algo_ops->iface.disable(hard_iface); bat_priv->num_ifaces--; hard_iface->if_status = BATADV_IF_NOT_IN_USE; goto err_upper; @@ -553,9 +555,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_recalc_extra_skbroom(soft_iface); - /* begin scheduling originator messages on that interface */ - batadv_schedule_bat_ogm(hard_iface); - out: return 0; @@ -599,7 +598,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_put(new_if); } - bat_priv->bat_algo_ops->bat_iface_disable(hard_iface); + bat_priv->algo_ops->iface.disable(hard_iface); hard_iface->if_status = BATADV_IF_NOT_IN_USE; /* delete all references to this hard_iface */ @@ -686,6 +685,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (batadv_is_wifi_netdev(net_dev)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; + batadv_v_hardif_init(hard_iface); + /* extra reference for return */ kref_init(&hard_iface->refcount); kref_get(&hard_iface->refcount); @@ -782,7 +783,7 @@ static int batadv_hard_if_event(struct notifier_block *this, batadv_check_known_mac_addr(hard_iface->net_dev); bat_priv = netdev_priv(hard_iface->soft_iface); - bat_priv->bat_algo_ops->bat_iface_update_mac(hard_iface); + bat_priv->algo_ops->iface.update_mac(hard_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 777aea10c..378cc1119 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -45,6 +45,7 @@ #include #include "hard-interface.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "send.h" diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c new file mode 100644 index 000000000..56dc532f7 --- /dev/null +++ b/net/batman-adv/log.c @@ -0,0 +1,231 @@ +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "log.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux/wait.h */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) + +static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; + +static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log, + size_t idx) +{ + return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK]; +} + +static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log, + char c) +{ + char *char_addr; + + char_addr = batadv_log_char_addr(debug_log, debug_log->log_end); + *char_addr = c; + debug_log->log_end++; + + if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len) + debug_log->log_start = debug_log->log_end - batadv_log_buff_len; +} + +__printf(2, 3) +static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, + const char *fmt, ...) +{ + va_list args; + static char debug_log_buf[256]; + char *p; + + if (!debug_log) + return 0; + + spin_lock_bh(&debug_log->lock); + va_start(args, fmt); + vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args); + va_end(args); + + for (p = debug_log_buf; *p != 0; p++) + batadv_emit_log_char(debug_log, *p); + + spin_unlock_bh(&debug_log->lock); + + wake_up(&debug_log->queue_wait); + + return 0; +} + +int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) +{ + va_list args; + char tmp_log_buf[256]; + + va_start(args, fmt); + vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args); + batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s", + jiffies_to_msecs(jiffies), tmp_log_buf); + va_end(args); + + return 0; +} + +static int batadv_log_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + nonseekable_open(inode, file); + file->private_data = inode->i_private; + return 0; +} + +static int batadv_log_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + +static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log) +{ + return !(debug_log->log_start - debug_log->log_end); +} + +static ssize_t batadv_log_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct batadv_priv *bat_priv = file->private_data; + struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; + int error, i = 0; + char *char_addr; + char c; + + if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log)) + return -EAGAIN; + + if (!buf) + return -EINVAL; + + if (count == 0) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + error = wait_event_interruptible(debug_log->queue_wait, + (!batadv_log_empty(debug_log))); + + if (error) + return error; + + spin_lock_bh(&debug_log->lock); + + while ((!error) && (i < count) && + (debug_log->log_start != debug_log->log_end)) { + char_addr = batadv_log_char_addr(debug_log, + debug_log->log_start); + c = *char_addr; + + debug_log->log_start++; + + spin_unlock_bh(&debug_log->lock); + + error = __put_user(c, buf); + + spin_lock_bh(&debug_log->lock); + + buf++; + i++; + } + + spin_unlock_bh(&debug_log->lock); + + if (!error) + return i; + + return error; +} + +static unsigned int batadv_log_poll(struct file *file, poll_table *wait) +{ + struct batadv_priv *bat_priv = file->private_data; + struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; + + poll_wait(file, &debug_log->queue_wait, wait); + + if (!batadv_log_empty(debug_log)) + return POLLIN | POLLRDNORM; + + return 0; +} + +static const struct file_operations batadv_log_fops = { + .open = batadv_log_open, + .release = batadv_log_release, + .read = batadv_log_read, + .poll = batadv_log_poll, + .llseek = no_llseek, +}; + +int batadv_debug_log_setup(struct batadv_priv *bat_priv) +{ + struct dentry *d; + + if (!bat_priv->debug_dir) + goto err; + + bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC); + if (!bat_priv->debug_log) + goto err; + + spin_lock_init(&bat_priv->debug_log->lock); + init_waitqueue_head(&bat_priv->debug_log->queue_wait); + + d = debugfs_create_file("log", S_IFREG | S_IRUSR, + bat_priv->debug_dir, bat_priv, + &batadv_log_fops); + if (!d) + goto err; + + return 0; + +err: + return -ENOMEM; +} + +void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) +{ + kfree(bat_priv->debug_log); + bat_priv->debug_log = NULL; +} diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h new file mode 100644 index 000000000..e0e1a88c3 --- /dev/null +++ b/net/batman-adv/log.h @@ -0,0 +1,111 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_LOG_H_ +#define _NET_BATMAN_ADV_LOG_H_ + +#include "main.h" + +#include +#include +#include + +#ifdef CONFIG_BATMAN_ADV_DEBUG + +int batadv_debug_log_setup(struct batadv_priv *bat_priv); +void batadv_debug_log_cleanup(struct batadv_priv *bat_priv); + +#else + +static inline int batadv_debug_log_setup(struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) +{ +} + +#endif + +/** + * enum batadv_dbg_level - available log levels + * @BATADV_DBG_BATMAN: OGM and TQ computations related messages + * @BATADV_DBG_ROUTES: route added / changed / deleted + * @BATADV_DBG_TT: translation table messages + * @BATADV_DBG_BLA: bridge loop avoidance messages + * @BATADV_DBG_DAT: ARP snooping and DAT related messages + * @BATADV_DBG_NC: network coding related messages + * @BATADV_DBG_MCAST: multicast related messages + * @BATADV_DBG_TP_METER: throughput meter messages + * @BATADV_DBG_ALL: the union of all the above log levels + */ +enum batadv_dbg_level { + BATADV_DBG_BATMAN = BIT(0), + BATADV_DBG_ROUTES = BIT(1), + BATADV_DBG_TT = BIT(2), + BATADV_DBG_BLA = BIT(3), + BATADV_DBG_DAT = BIT(4), + BATADV_DBG_NC = BIT(5), + BATADV_DBG_MCAST = BIT(6), + BATADV_DBG_TP_METER = BIT(7), + BATADV_DBG_ALL = 127, +}; + +#ifdef CONFIG_BATMAN_ADV_DEBUG +int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) +__printf(2, 3); + +/* possibly ratelimited debug output */ +#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ + do { \ + if (atomic_read(&bat_priv->log_level) & type && \ + (!ratelimited || net_ratelimit())) \ + batadv_debug_log(bat_priv, fmt, ## arg);\ + } \ + while (0) +#else /* !CONFIG_BATMAN_ADV_DEBUG */ +__printf(4, 5) +static inline void _batadv_dbg(int type __always_unused, + struct batadv_priv *bat_priv __always_unused, + int ratelimited __always_unused, + const char *fmt __always_unused, ...) +{ +} +#endif + +#define batadv_dbg(type, bat_priv, arg...) \ + _batadv_dbg(type, bat_priv, 0, ## arg) +#define batadv_dbg_ratelimited(type, bat_priv, arg...) \ + _batadv_dbg(type, bat_priv, 1, ## arg) + +#define batadv_info(net_dev, fmt, arg...) \ + do { \ + struct net_device *_netdev = (net_dev); \ + struct batadv_priv *_batpriv = netdev_priv(_netdev); \ + batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \ + pr_info("%s: " fmt, _netdev->name, ## arg); \ + } while (0) +#define batadv_err(net_dev, fmt, arg...) \ + do { \ + struct net_device *_netdev = (net_dev); \ + struct batadv_priv *_batpriv = netdev_priv(_netdev); \ + batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \ + pr_err("%s: " fmt, _netdev->name, ## arg); \ + } while (0) + +#endif /* _NET_BATMAN_ADV_LOG_H_ */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 5f2974bd1..fe4c5e29f 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -31,16 +31,13 @@ #include #include #include -#include #include -#include #include -#include +#include #include #include #include #include -#include #include #include #include @@ -49,6 +46,8 @@ #include #include "bat_algo.h" +#include "bat_iv_ogm.h" +#include "bat_v.h" #include "bridge_loop_avoidance.h" #include "debugfs.h" #include "distributed-arp-table.h" @@ -56,13 +55,16 @@ #include "gateway_common.h" #include "hard-interface.h" #include "icmp_socket.h" +#include "log.h" #include "multicast.h" +#include "netlink.h" #include "network-coding.h" #include "originator.h" #include "packet.h" #include "routing.h" #include "send.h" #include "soft-interface.h" +#include "tp_meter.h" #include "translation-table.h" /* List manipulations on hardif_list have to be rtnl_lock()'ed, @@ -71,8 +73,6 @@ struct list_head batadv_hardif_list; static int (*batadv_rx_handler[256])(struct sk_buff *, struct batadv_hard_iface *); -char batadv_routing_algo[20] = "BATMAN_IV"; -static struct hlist_head batadv_algo_list; unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; @@ -83,13 +83,14 @@ static void batadv_recv_handler_init(void); static int __init batadv_init(void) { INIT_LIST_HEAD(&batadv_hardif_list); - INIT_HLIST_HEAD(&batadv_algo_list); + batadv_algo_init(); batadv_recv_handler_init(); batadv_v_init(); batadv_iv_init(); batadv_nc_init(); + batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); @@ -101,6 +102,7 @@ static int __init batadv_init(void) register_netdevice_notifier(&batadv_hard_if_notifier); rtnl_link_register(&batadv_link_ops); + batadv_netlink_register(); pr_info("B.A.T.M.A.N. advanced %s (compatibility version %i) loaded\n", BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION); @@ -111,6 +113,7 @@ static int __init batadv_init(void) static void __exit batadv_exit(void) { batadv_debugfs_destroy(); + batadv_netlink_unregister(); rtnl_link_unregister(&batadv_link_ops); unregister_netdevice_notifier(&batadv_hard_if_notifier); batadv_hardif_remove_interfaces(); @@ -141,6 +144,7 @@ int batadv_mesh_init(struct net_device *soft_iface) spin_lock_init(&bat_priv->tvlv.container_list_lock); spin_lock_init(&bat_priv->tvlv.handler_list_lock); spin_lock_init(&bat_priv->softif_vlan_list_lock); + spin_lock_init(&bat_priv->tp_list_lock); INIT_HLIST_HEAD(&bat_priv->forw_bat_list); INIT_HLIST_HEAD(&bat_priv->forw_bcast_list); @@ -159,6 +163,7 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->tvlv.container_list); INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list); INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); + INIT_HLIST_HEAD(&bat_priv->tp_list); ret = batadv_v_mesh_init(bat_priv); if (ret < 0) @@ -538,78 +543,6 @@ void batadv_recv_handler_unregister(u8 packet_type) batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet; } -static struct batadv_algo_ops *batadv_algo_get(char *name) -{ - struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; - - hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) { - if (strcmp(bat_algo_ops_tmp->name, name) != 0) - continue; - - bat_algo_ops = bat_algo_ops_tmp; - break; - } - - return bat_algo_ops; -} - -int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) -{ - struct batadv_algo_ops *bat_algo_ops_tmp; - - bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); - if (bat_algo_ops_tmp) { - pr_info("Trying to register already registered routing algorithm: %s\n", - bat_algo_ops->name); - return -EEXIST; - } - - /* all algorithms must implement all ops (for now) */ - if (!bat_algo_ops->bat_iface_enable || - !bat_algo_ops->bat_iface_disable || - !bat_algo_ops->bat_iface_update_mac || - !bat_algo_ops->bat_primary_iface_set || - !bat_algo_ops->bat_ogm_schedule || - !bat_algo_ops->bat_ogm_emit || - !bat_algo_ops->bat_neigh_cmp || - !bat_algo_ops->bat_neigh_is_similar_or_better) { - pr_info("Routing algo '%s' does not implement required ops\n", - bat_algo_ops->name); - return -EINVAL; - } - - INIT_HLIST_NODE(&bat_algo_ops->list); - hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); - - return 0; -} - -int batadv_algo_select(struct batadv_priv *bat_priv, char *name) -{ - struct batadv_algo_ops *bat_algo_ops; - - bat_algo_ops = batadv_algo_get(name); - if (!bat_algo_ops) - return -EINVAL; - - bat_priv->bat_algo_ops = bat_algo_ops; - - return 0; -} - -int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) -{ - struct batadv_algo_ops *bat_algo_ops; - - seq_puts(seq, "Available routing algorithms:\n"); - - hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { - seq_printf(seq, " * %s\n", bat_algo_ops->name); - } - - return 0; -} - /** * batadv_skb_crc32 - calculate CRC32 of the whole packet and skip bytes in * the header @@ -643,594 +576,6 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) return htonl(crc); } -/** - * batadv_tvlv_handler_release - release tvlv handler from lists and queue for - * free after rcu grace period - * @ref: kref pointer of the tvlv - */ -static void batadv_tvlv_handler_release(struct kref *ref) -{ - struct batadv_tvlv_handler *tvlv_handler; - - tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); - kfree_rcu(tvlv_handler, rcu); -} - -/** - * batadv_tvlv_handler_put - decrement the tvlv container refcounter and - * possibly release it - * @tvlv_handler: the tvlv handler to free - */ -static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) -{ - kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); -} - -/** - * batadv_tvlv_handler_get - retrieve tvlv handler from the tvlv handler list - * based on the provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv handler type to look for - * @version: tvlv handler version to look for - * - * Return: tvlv handler if found or NULL otherwise. - */ -static struct batadv_tvlv_handler * -batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) -{ - struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tvlv_handler_tmp, - &bat_priv->tvlv.handler_list, list) { - if (tvlv_handler_tmp->type != type) - continue; - - if (tvlv_handler_tmp->version != version) - continue; - - if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) - continue; - - tvlv_handler = tvlv_handler_tmp; - break; - } - rcu_read_unlock(); - - return tvlv_handler; -} - -/** - * batadv_tvlv_container_release - release tvlv from lists and free - * @ref: kref pointer of the tvlv - */ -static void batadv_tvlv_container_release(struct kref *ref) -{ - struct batadv_tvlv_container *tvlv; - - tvlv = container_of(ref, struct batadv_tvlv_container, refcount); - kfree(tvlv); -} - -/** - * batadv_tvlv_container_put - decrement the tvlv container refcounter and - * possibly release it - * @tvlv: the tvlv container to free - */ -static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) -{ - kref_put(&tvlv->refcount, batadv_tvlv_container_release); -} - -/** - * batadv_tvlv_container_get - retrieve tvlv container from the tvlv container - * list based on the provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv container type to look for - * @version: tvlv container version to look for - * - * Has to be called with the appropriate locks being acquired - * (tvlv.container_list_lock). - * - * Return: tvlv container if found or NULL otherwise. - */ -static struct batadv_tvlv_container * -batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) -{ - struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; - - lockdep_assert_held(&bat_priv->tvlv.container_list_lock); - - hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { - if (tvlv_tmp->tvlv_hdr.type != type) - continue; - - if (tvlv_tmp->tvlv_hdr.version != version) - continue; - - kref_get(&tvlv_tmp->refcount); - tvlv = tvlv_tmp; - break; - } - - return tvlv; -} - -/** - * batadv_tvlv_container_list_size - calculate the size of the tvlv container - * list entries - * @bat_priv: the bat priv with all the soft interface information - * - * Has to be called with the appropriate locks being acquired - * (tvlv.container_list_lock). - * - * Return: size of all currently registered tvlv containers in bytes. - */ -static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) -{ - struct batadv_tvlv_container *tvlv; - u16 tvlv_len = 0; - - lockdep_assert_held(&bat_priv->tvlv.container_list_lock); - - hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { - tvlv_len += sizeof(struct batadv_tvlv_hdr); - tvlv_len += ntohs(tvlv->tvlv_hdr.len); - } - - return tvlv_len; -} - -/** - * batadv_tvlv_container_remove - remove tvlv container from the tvlv container - * list - * @bat_priv: the bat priv with all the soft interface information - * @tvlv: the to be removed tvlv container - * - * Has to be called with the appropriate locks being acquired - * (tvlv.container_list_lock). - */ -static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, - struct batadv_tvlv_container *tvlv) -{ - lockdep_assert_held(&bat_priv->tvlv.container_list_lock); - - if (!tvlv) - return; - - hlist_del(&tvlv->list); - - /* first call to decrement the counter, second call to free */ - batadv_tvlv_container_put(tvlv); - batadv_tvlv_container_put(tvlv); -} - -/** - * batadv_tvlv_container_unregister - unregister tvlv container based on the - * provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv container type to unregister - * @version: tvlv container type to unregister - */ -void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, - u8 type, u8 version) -{ - struct batadv_tvlv_container *tvlv; - - spin_lock_bh(&bat_priv->tvlv.container_list_lock); - tvlv = batadv_tvlv_container_get(bat_priv, type, version); - batadv_tvlv_container_remove(bat_priv, tvlv); - spin_unlock_bh(&bat_priv->tvlv.container_list_lock); -} - -/** - * batadv_tvlv_container_register - register tvlv type, version and content - * to be propagated with each (primary interface) OGM - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv container type - * @version: tvlv container version - * @tvlv_value: tvlv container content - * @tvlv_value_len: tvlv container content length - * - * If a container of the same type and version was already registered the new - * content is going to replace the old one. - */ -void batadv_tvlv_container_register(struct batadv_priv *bat_priv, - u8 type, u8 version, - void *tvlv_value, u16 tvlv_value_len) -{ - struct batadv_tvlv_container *tvlv_old, *tvlv_new; - - if (!tvlv_value) - tvlv_value_len = 0; - - tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC); - if (!tvlv_new) - return; - - tvlv_new->tvlv_hdr.version = version; - tvlv_new->tvlv_hdr.type = type; - tvlv_new->tvlv_hdr.len = htons(tvlv_value_len); - - memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); - INIT_HLIST_NODE(&tvlv_new->list); - kref_init(&tvlv_new->refcount); - - spin_lock_bh(&bat_priv->tvlv.container_list_lock); - tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); - batadv_tvlv_container_remove(bat_priv, tvlv_old); - hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); - spin_unlock_bh(&bat_priv->tvlv.container_list_lock); -} - -/** - * batadv_tvlv_realloc_packet_buff - reallocate packet buffer to accommodate - * requested packet size - * @packet_buff: packet buffer - * @packet_buff_len: packet buffer size - * @min_packet_len: requested packet minimum size - * @additional_packet_len: requested additional packet size on top of minimum - * size - * - * Return: true of the packet buffer could be changed to the requested size, - * false otherwise. - */ -static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, - int *packet_buff_len, - int min_packet_len, - int additional_packet_len) -{ - unsigned char *new_buff; - - new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); - - /* keep old buffer if kmalloc should fail */ - if (!new_buff) - return false; - - memcpy(new_buff, *packet_buff, min_packet_len); - kfree(*packet_buff); - *packet_buff = new_buff; - *packet_buff_len = min_packet_len + additional_packet_len; - - return true; -} - -/** - * batadv_tvlv_container_ogm_append - append tvlv container content to given - * OGM packet buffer - * @bat_priv: the bat priv with all the soft interface information - * @packet_buff: ogm packet buffer - * @packet_buff_len: ogm packet buffer size including ogm header and tvlv - * content - * @packet_min_len: ogm header size to be preserved for the OGM itself - * - * The ogm packet might be enlarged or shrunk depending on the current size - * and the size of the to-be-appended tvlv containers. - * - * Return: size of all appended tvlv containers in bytes. - */ -u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, - unsigned char **packet_buff, - int *packet_buff_len, int packet_min_len) -{ - struct batadv_tvlv_container *tvlv; - struct batadv_tvlv_hdr *tvlv_hdr; - u16 tvlv_value_len; - void *tvlv_value; - bool ret; - - spin_lock_bh(&bat_priv->tvlv.container_list_lock); - tvlv_value_len = batadv_tvlv_container_list_size(bat_priv); - - ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len, - packet_min_len, tvlv_value_len); - - if (!ret) - goto end; - - if (!tvlv_value_len) - goto end; - - tvlv_value = (*packet_buff) + packet_min_len; - - hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { - tvlv_hdr = tvlv_value; - tvlv_hdr->type = tvlv->tvlv_hdr.type; - tvlv_hdr->version = tvlv->tvlv_hdr.version; - tvlv_hdr->len = tvlv->tvlv_hdr.len; - tvlv_value = tvlv_hdr + 1; - memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len)); - tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); - } - -end: - spin_unlock_bh(&bat_priv->tvlv.container_list_lock); - return tvlv_value_len; -} - -/** - * batadv_tvlv_call_handler - parse the given tvlv buffer to call the - * appropriate handlers - * @bat_priv: the bat priv with all the soft interface information - * @tvlv_handler: tvlv callback function handling the tvlv content - * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet - * @orig_node: orig node emitting the ogm packet - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet - * @tvlv_value: tvlv content - * @tvlv_value_len: tvlv content length - * - * Return: success if handler was not found or the return value of the handler - * callback. - */ -static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, - struct batadv_tvlv_handler *tvlv_handler, - bool ogm_source, - struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_value, u16 tvlv_value_len) -{ - if (!tvlv_handler) - return NET_RX_SUCCESS; - - if (ogm_source) { - if (!tvlv_handler->ogm_handler) - return NET_RX_SUCCESS; - - if (!orig_node) - return NET_RX_SUCCESS; - - tvlv_handler->ogm_handler(bat_priv, orig_node, - BATADV_NO_FLAGS, - tvlv_value, tvlv_value_len); - tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED; - } else { - if (!src) - return NET_RX_SUCCESS; - - if (!dst) - return NET_RX_SUCCESS; - - if (!tvlv_handler->unicast_handler) - return NET_RX_SUCCESS; - - return tvlv_handler->unicast_handler(bat_priv, src, - dst, tvlv_value, - tvlv_value_len); - } - - return NET_RX_SUCCESS; -} - -/** - * batadv_tvlv_containers_process - parse the given tvlv buffer to call the - * appropriate handlers - * @bat_priv: the bat priv with all the soft interface information - * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet - * @orig_node: orig node emitting the ogm packet - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet - * @tvlv_value: tvlv content - * @tvlv_value_len: tvlv content length - * - * Return: success when processing an OGM or the return value of all called - * handler callbacks. - */ -int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, - bool ogm_source, - struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_value, u16 tvlv_value_len) -{ - struct batadv_tvlv_handler *tvlv_handler; - struct batadv_tvlv_hdr *tvlv_hdr; - u16 tvlv_value_cont_len; - u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; - int ret = NET_RX_SUCCESS; - - while (tvlv_value_len >= sizeof(*tvlv_hdr)) { - tvlv_hdr = tvlv_value; - tvlv_value_cont_len = ntohs(tvlv_hdr->len); - tvlv_value = tvlv_hdr + 1; - tvlv_value_len -= sizeof(*tvlv_hdr); - - if (tvlv_value_cont_len > tvlv_value_len) - break; - - tvlv_handler = batadv_tvlv_handler_get(bat_priv, - tvlv_hdr->type, - tvlv_hdr->version); - - ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler, - ogm_source, orig_node, - src, dst, tvlv_value, - tvlv_value_cont_len); - if (tvlv_handler) - batadv_tvlv_handler_put(tvlv_handler); - tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; - tvlv_value_len -= tvlv_value_cont_len; - } - - if (!ogm_source) - return ret; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tvlv_handler, - &bat_priv->tvlv.handler_list, list) { - if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) && - !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED)) - tvlv_handler->ogm_handler(bat_priv, orig_node, - cifnotfound, NULL, 0); - - tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED; - } - rcu_read_unlock(); - - return NET_RX_SUCCESS; -} - -/** - * batadv_tvlv_ogm_receive - process an incoming ogm and call the appropriate - * handlers - * @bat_priv: the bat priv with all the soft interface information - * @batadv_ogm_packet: ogm packet containing the tvlv containers - * @orig_node: orig node emitting the ogm packet - */ -void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, - struct batadv_ogm_packet *batadv_ogm_packet, - struct batadv_orig_node *orig_node) -{ - void *tvlv_value; - u16 tvlv_value_len; - - if (!batadv_ogm_packet) - return; - - tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len); - if (!tvlv_value_len) - return; - - tvlv_value = batadv_ogm_packet + 1; - - batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL, - tvlv_value, tvlv_value_len); -} - -/** - * batadv_tvlv_handler_register - register tvlv handler based on the provided - * type and version (both need to match) for ogm tvlv payload and/or unicast - * payload - * @bat_priv: the bat priv with all the soft interface information - * @optr: ogm tvlv handler callback function. This function receives the orig - * node, flags and the tvlv content as argument to process. - * @uptr: unicast tvlv handler callback function. This function receives the - * source & destination of the unicast packet as well as the tvlv content - * to process. - * @type: tvlv handler type to be registered - * @version: tvlv handler version to be registered - * @flags: flags to enable or disable TVLV API behavior - */ -void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, - void (*optr)(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig, - u8 flags, - void *tvlv_value, - u16 tvlv_value_len), - int (*uptr)(struct batadv_priv *bat_priv, - u8 *src, u8 *dst, - void *tvlv_value, - u16 tvlv_value_len), - u8 type, u8 version, u8 flags) -{ - struct batadv_tvlv_handler *tvlv_handler; - - tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); - if (tvlv_handler) { - batadv_tvlv_handler_put(tvlv_handler); - return; - } - - tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); - if (!tvlv_handler) - return; - - tvlv_handler->ogm_handler = optr; - tvlv_handler->unicast_handler = uptr; - tvlv_handler->type = type; - tvlv_handler->version = version; - tvlv_handler->flags = flags; - kref_init(&tvlv_handler->refcount); - INIT_HLIST_NODE(&tvlv_handler->list); - - spin_lock_bh(&bat_priv->tvlv.handler_list_lock); - hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); - spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); -} - -/** - * batadv_tvlv_handler_unregister - unregister tvlv handler based on the - * provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv handler type to be unregistered - * @version: tvlv handler version to be unregistered - */ -void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, - u8 type, u8 version) -{ - struct batadv_tvlv_handler *tvlv_handler; - - tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); - if (!tvlv_handler) - return; - - batadv_tvlv_handler_put(tvlv_handler); - spin_lock_bh(&bat_priv->tvlv.handler_list_lock); - hlist_del_rcu(&tvlv_handler->list); - spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); - batadv_tvlv_handler_put(tvlv_handler); -} - -/** - * batadv_tvlv_unicast_send - send a unicast packet with tvlv payload to the - * specified host - * @bat_priv: the bat priv with all the soft interface information - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet - * @type: tvlv type - * @version: tvlv version - * @tvlv_value: tvlv content - * @tvlv_value_len: tvlv content length - */ -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, - u8 *dst, u8 type, u8 version, - void *tvlv_value, u16 tvlv_value_len) -{ - struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; - struct batadv_tvlv_hdr *tvlv_hdr; - struct batadv_orig_node *orig_node; - struct sk_buff *skb; - unsigned char *tvlv_buff; - unsigned int tvlv_len; - ssize_t hdr_len = sizeof(*unicast_tvlv_packet); - - orig_node = batadv_orig_hash_find(bat_priv, dst); - if (!orig_node) - return; - - tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len; - - skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len); - if (!skb) - goto out; - - skb->priority = TC_PRIO_CONTROL; - skb_reserve(skb, ETH_HLEN); - tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len); - unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff; - unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV; - unicast_tvlv_packet->version = BATADV_COMPAT_VERSION; - unicast_tvlv_packet->ttl = BATADV_TTL; - unicast_tvlv_packet->reserved = 0; - unicast_tvlv_packet->tvlv_len = htons(tvlv_len); - unicast_tvlv_packet->align = 0; - ether_addr_copy(unicast_tvlv_packet->src, src); - ether_addr_copy(unicast_tvlv_packet->dst, dst); - - tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1); - tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff; - tvlv_hdr->version = version; - tvlv_hdr->type = type; - tvlv_hdr->len = htons(tvlv_value_len); - tvlv_buff += sizeof(*tvlv_hdr); - memcpy(tvlv_buff, tvlv_value, tvlv_value_len); - - if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP) - kfree_skb(skb); -out: - batadv_orig_node_put(orig_node); -} - /** * batadv_get_vid - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet @@ -1284,36 +629,6 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) return ap_isolation_enabled; } -static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) -{ - struct batadv_algo_ops *bat_algo_ops; - char *algo_name = (char *)val; - size_t name_len = strlen(algo_name); - - if (name_len > 0 && algo_name[name_len - 1] == '\n') - algo_name[name_len - 1] = '\0'; - - bat_algo_ops = batadv_algo_get(algo_name); - if (!bat_algo_ops) { - pr_err("Routing algorithm '%s' is not supported\n", algo_name); - return -EINVAL; - } - - return param_set_copystring(algo_name, kp); -} - -static const struct kernel_param_ops batadv_param_ops_ra = { - .set = batadv_param_set_ra, - .get = param_get_string, -}; - -static struct kparam_string batadv_param_string_ra = { - .maxlen = sizeof(batadv_routing_algo), - .string = batadv_routing_algo, -}; - -module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, - 0644); module_init(batadv_init); module_exit(batadv_exit); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 76925266d..06a860845 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2016.2" +#define BATADV_SOURCE_VERSION "2016.3" #endif /* B.A.T.M.A.N. parameters */ @@ -100,6 +100,9 @@ #define BATADV_NUM_BCASTS_WIRELESS 3 #define BATADV_NUM_BCASTS_MAX 3 +/* length of the single packet used by the TP meter */ +#define BATADV_TP_PACKET_LEN ETH_DATA_LEN + /* msecs after which an ARP_REQUEST is sent in broadcast as fallback */ #define ARP_REQ_DELAY 250 /* numbers of originator to contact for any PUT/GET DHT operation */ @@ -131,6 +134,11 @@ #define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */ +/** + * BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions + */ +#define BATADV_TP_MAX_NUM 5 + enum batadv_mesh_state { BATADV_MESH_INACTIVE, BATADV_MESH_ACTIVE, @@ -175,29 +183,26 @@ enum batadv_uev_type { /* Kernel headers */ -#include #include /* for packet.h */ #include #include #include #include /* for packet.h */ -#include -#include -#include -#include -#include #include +#include +#include +#include #include "types.h" -struct batadv_ogm_packet; +struct net_device; +struct packet_type; struct seq_file; struct sk_buff; #define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? \ (int)(vid & VLAN_VID_MASK) : -1) -extern char batadv_routing_algo[]; extern struct list_head batadv_hardif_list; extern unsigned char batadv_broadcast_addr[]; @@ -218,73 +223,8 @@ batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); void batadv_recv_handler_unregister(u8 packet_type); -int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); -int batadv_algo_select(struct batadv_priv *bat_priv, char *name); -int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); -/** - * enum batadv_dbg_level - available log levels - * @BATADV_DBG_BATMAN: OGM and TQ computations related messages - * @BATADV_DBG_ROUTES: route added / changed / deleted - * @BATADV_DBG_TT: translation table messages - * @BATADV_DBG_BLA: bridge loop avoidance messages - * @BATADV_DBG_DAT: ARP snooping and DAT related messages - * @BATADV_DBG_NC: network coding related messages - * @BATADV_DBG_ALL: the union of all the above log levels - */ -enum batadv_dbg_level { - BATADV_DBG_BATMAN = BIT(0), - BATADV_DBG_ROUTES = BIT(1), - BATADV_DBG_TT = BIT(2), - BATADV_DBG_BLA = BIT(3), - BATADV_DBG_DAT = BIT(4), - BATADV_DBG_NC = BIT(5), - BATADV_DBG_ALL = 63, -}; - -#ifdef CONFIG_BATMAN_ADV_DEBUG -int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) -__printf(2, 3); - -/* possibly ratelimited debug output */ -#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ - do { \ - if (atomic_read(&bat_priv->log_level) & type && \ - (!ratelimited || net_ratelimit())) \ - batadv_debug_log(bat_priv, fmt, ## arg);\ - } \ - while (0) -#else /* !CONFIG_BATMAN_ADV_DEBUG */ -__printf(4, 5) -static inline void _batadv_dbg(int type __always_unused, - struct batadv_priv *bat_priv __always_unused, - int ratelimited __always_unused, - const char *fmt __always_unused, ...) -{ -} -#endif - -#define batadv_dbg(type, bat_priv, arg...) \ - _batadv_dbg(type, bat_priv, 0, ## arg) -#define batadv_dbg_ratelimited(type, bat_priv, arg...) \ - _batadv_dbg(type, bat_priv, 1, ## arg) - -#define batadv_info(net_dev, fmt, arg...) \ - do { \ - struct net_device *_netdev = (net_dev); \ - struct batadv_priv *_batpriv = netdev_priv(_netdev); \ - batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \ - pr_info("%s: " fmt, _netdev->name, ## arg); \ - } while (0) -#define batadv_err(net_dev, fmt, arg...) \ - do { \ - struct net_device *_netdev = (net_dev); \ - struct batadv_priv *_batpriv = netdev_priv(_netdev); \ - batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \ - pr_err("%s: " fmt, _netdev->name, ## arg); \ - } while (0) - /** * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses * @data1: Pointer to a six-byte array containing the Ethernet address @@ -370,39 +310,6 @@ static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) */ #define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0])) -void batadv_tvlv_container_register(struct batadv_priv *bat_priv, - u8 type, u8 version, - void *tvlv_value, u16 tvlv_value_len); -u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, - unsigned char **packet_buff, - int *packet_buff_len, int packet_min_len); -void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, - struct batadv_ogm_packet *batadv_ogm_packet, - struct batadv_orig_node *orig_node); -void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, - u8 type, u8 version); - -void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, - void (*optr)(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig, - u8 flags, - void *tvlv_value, - u16 tvlv_value_len), - int (*uptr)(struct batadv_priv *bat_priv, - u8 *src, u8 *dst, - void *tvlv_value, - u16 tvlv_value_len), - u8 type, u8 version, u8 flags); -void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, - u8 type, u8 version); -int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, - bool ogm_source, - struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_buff, u16 tvlv_buff_len); -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, - u8 *dst, u8 type, u8 version, - void *tvlv_value, u16 tvlv_value_len); unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len); bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid); diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index c32f24faf..cc915073a 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -25,17 +25,23 @@ #include #include #include +#include +#include #include -#include +#include #include +#include #include #include +#include #include #include #include #include +#include #include #include +#include #include #include #include @@ -43,18 +49,57 @@ #include #include #include +#include +#include #include +#include "hard-interface.h" +#include "hash.h" +#include "log.h" #include "packet.h" #include "translation-table.h" +#include "tvlv.h" + +/** + * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists + * @soft_iface: netdev struct of the mesh interface + * + * If the given soft interface has a bridge on top then the refcount + * of the according net device is increased. + * + * Return: NULL if no such bridge exists. Otherwise the net device of the + * bridge. + */ +static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) +{ + struct net_device *upper = soft_iface; + + rcu_read_lock(); + do { + upper = netdev_master_upper_dev_get_rcu(upper); + } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); + + if (upper) + dev_hold(upper); + rcu_read_unlock(); + + return upper; +} /** * batadv_mcast_mla_softif_get - get softif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * - * Collect multicast addresses of the local multicast listeners - * on the given soft interface, dev, in the given mcast_list. + * Collects multicast addresses of multicast listeners residing + * on this kernel on the given soft interface, dev, in + * the given mcast_list. In general, multicast listeners provided by + * your multicast receiving applications run directly on this node. + * + * If there is a bridge interface on top of dev, collects from that one + * instead. Just like with IP addresses and routes, multicast listeners + * will(/should) register to the bridge interface instead of an + * enslaved bat0. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. @@ -62,12 +107,13 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, struct hlist_head *mcast_list) { + struct net_device *bridge = batadv_mcast_get_bridge(dev); struct netdev_hw_addr *mc_list_entry; struct batadv_hw_addr *new; int ret = 0; - netif_addr_lock_bh(dev); - netdev_for_each_mc_addr(mc_list_entry, dev) { + netif_addr_lock_bh(bridge ? bridge : dev); + netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) { new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; @@ -78,7 +124,10 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, hlist_add_head(&new->list, mcast_list); ret++; } - netif_addr_unlock_bh(dev); + netif_addr_unlock_bh(bridge ? bridge : dev); + + if (bridge) + dev_put(bridge); return ret; } @@ -103,6 +152,83 @@ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, return false; } +/** + * batadv_mcast_mla_br_addr_cpy - copy a bridge multicast address + * @dst: destination to write to - a multicast MAC address + * @src: source to read from - a multicast IP address + * + * Converts a given multicast IPv4/IPv6 address from a bridge + * to its matching multicast MAC address and copies it into the given + * destination buffer. + * + * Caller needs to make sure the destination buffer can hold + * at least ETH_ALEN bytes. + */ +static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) +{ + if (src->proto == htons(ETH_P_IP)) + ip_eth_mc_map(src->u.ip4, dst); +#if IS_ENABLED(CONFIG_IPV6) + else if (src->proto == htons(ETH_P_IPV6)) + ipv6_eth_mc_map(&src->u.ip6, dst); +#endif + else + eth_zero_addr(dst); +} + +/** + * batadv_mcast_mla_bridge_get - get bridged-in multicast listeners + * @dev: a bridge slave whose bridge to collect multicast addresses from + * @mcast_list: a list to put found addresses into + * + * Collects multicast addresses of multicast listeners residing + * on foreign, non-mesh devices which we gave access to our mesh via + * a bridge on top of the given soft interface, dev, in the given + * mcast_list. + * + * Return: -ENOMEM on memory allocation error or the number of + * items added to the mcast_list otherwise. + */ +static int batadv_mcast_mla_bridge_get(struct net_device *dev, + struct hlist_head *mcast_list) +{ + struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); + struct br_ip_list *br_ip_entry, *tmp; + struct batadv_hw_addr *new; + u8 mcast_addr[ETH_ALEN]; + int ret; + + /* we don't need to detect these devices/listeners, the IGMP/MLD + * snooping code of the Linux bridge already does that for us + */ + ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); + if (ret < 0) + goto out; + + list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { + batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); + if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) + continue; + + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (!new) { + ret = -ENOMEM; + break; + } + + ether_addr_copy(new->addr, mcast_addr); + hlist_add_head(&new->list, mcast_list); + } + +out: + list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { + list_del(&br_ip_entry->list); + kfree(br_ip_entry); + } + + return ret; +} + /** * batadv_mcast_mla_list_free - free a list of multicast addresses * @bat_priv: the bat priv with all the soft interface information @@ -213,6 +339,122 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) return upper; } +/** + * batadv_mcast_querier_log - debug output regarding the querier status on link + * @bat_priv: the bat priv with all the soft interface information + * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") + * @old_state: the previous querier state on our link + * @new_state: the new querier state on our link + * + * Outputs debug messages to the logging facility with log level 'mcast' + * regarding changes to the querier status on the link which are relevant + * to our multicast optimizations. + * + * Usually this is about whether a querier appeared or vanished in + * our mesh or whether the querier is in the suboptimal position of being + * behind our local bridge segment: Snooping switches will directly + * forward listener reports to the querier, therefore batman-adv and + * the bridge will potentially not see these listeners - the querier is + * potentially shadowing listeners from us then. + * + * This is only interesting for nodes with a bridge on top of their + * soft interface. + */ +static void +batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, + struct batadv_mcast_querier_state *old_state, + struct batadv_mcast_querier_state *new_state) +{ + if (!old_state->exists && new_state->exists) + batadv_info(bat_priv->soft_iface, "%s Querier appeared\n", + str_proto); + else if (old_state->exists && !new_state->exists) + batadv_info(bat_priv->soft_iface, + "%s Querier disappeared - multicast optimizations disabled\n", + str_proto); + else if (!bat_priv->mcast.bridged && !new_state->exists) + batadv_info(bat_priv->soft_iface, + "No %s Querier present - multicast optimizations disabled\n", + str_proto); + + if (new_state->exists) { + if ((!old_state->shadowing && new_state->shadowing) || + (!old_state->exists && new_state->shadowing)) + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "%s Querier is behind our bridged segment: Might shadow listeners\n", + str_proto); + else if (old_state->shadowing && !new_state->shadowing) + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "%s Querier is not behind our bridged segment\n", + str_proto); + } +} + +/** + * batadv_mcast_bridge_log - debug output for topology changes in bridged setups + * @bat_priv: the bat priv with all the soft interface information + * @bridged: a flag about whether the soft interface is currently bridged or not + * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier + * @querier_ipv6: (maybe) new status of a potential, selected MLD querier + * + * If no bridges are ever used on this node, then this function does nothing. + * + * Otherwise this function outputs debug information to the 'mcast' log level + * which might be relevant to our multicast optimizations. + * + * More precisely, it outputs information when a bridge interface is added or + * removed from a soft interface. And when a bridge is present, it further + * outputs information about the querier state which is relevant for the + * multicast flags this node is going to set. + */ +static void +batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged, + struct batadv_mcast_querier_state *querier_ipv4, + struct batadv_mcast_querier_state *querier_ipv6) +{ + if (!bat_priv->mcast.bridged && bridged) + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Bridge added: Setting Unsnoopables(U)-flag\n"); + else if (bat_priv->mcast.bridged && !bridged) + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); + + if (bridged) { + batadv_mcast_querier_log(bat_priv, "IGMP", + &bat_priv->mcast.querier_ipv4, + querier_ipv4); + batadv_mcast_querier_log(bat_priv, "MLD", + &bat_priv->mcast.querier_ipv6, + querier_ipv6); + } +} + +/** + * batadv_mcast_flags_logs - output debug information about mcast flag changes + * @bat_priv: the bat priv with all the soft interface information + * @flags: flags indicating the new multicast state + * + * Whenever the multicast flags this nodes announces changes (@mcast_flags vs. + * bat_priv->mcast.flags), this notifies userspace via the 'mcast' log level. + */ +static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) +{ + u8 old_flags = bat_priv->mcast.flags; + char str_old_flags[] = "[...]"; + + sprintf(str_old_flags, "[%c%c%c]", + (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', + (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', + (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Changing multicast flags from '%s' to '[%c%c%c]'\n", + bat_priv->mcast.enabled ? str_old_flags : "", + (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); +} + /** * batadv_mcast_mla_tvlv_update - update multicast tvlv * @bat_priv: the bat priv with all the soft interface information @@ -220,38 +462,73 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. * - * Return: true if the tvlv container is registered afterwards. Otherwise - * returns false. + * Return: false if we want all IPv4 && IPv6 multicast traffic and true + * otherwise. */ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) { struct batadv_tvlv_mcast_data mcast_data; + struct batadv_mcast_querier_state querier4 = {false, false}; + struct batadv_mcast_querier_state querier6 = {false, false}; + struct net_device *dev = bat_priv->soft_iface; + bool bridged; mcast_data.flags = BATADV_NO_FLAGS; memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); - /* Avoid attaching MLAs, if there is a bridge on top of our soft - * interface, we don't support that yet (TODO) + bridged = batadv_mcast_has_bridge(bat_priv); + if (!bridged) + goto update; + +#if !IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) + pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); +#endif + + querier4.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); + querier4.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); + + querier6.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); + querier6.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); + + mcast_data.flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; + + /* 1) If no querier exists at all, then multicast listeners on + * our local TT clients behind the bridge will keep silent. + * 2) If the selected querier is on one of our local TT clients, + * behind the bridge, then this querier might shadow multicast + * listeners on our local TT clients, behind this bridge. + * + * In both cases, we will signalize other batman nodes that + * we need all multicast traffic of the according protocol. */ - if (batadv_mcast_has_bridge(bat_priv)) { - if (bat_priv->mcast.enabled) { - batadv_tvlv_container_unregister(bat_priv, - BATADV_TVLV_MCAST, 1); - bat_priv->mcast.enabled = false; - } + if (!querier4.exists || querier4.shadowing) + mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV4; - return false; - } + if (!querier6.exists || querier6.shadowing) + mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV6; + +update: + batadv_mcast_bridge_log(bat_priv, bridged, &querier4, &querier6); + + bat_priv->mcast.querier_ipv4.exists = querier4.exists; + bat_priv->mcast.querier_ipv4.shadowing = querier4.shadowing; + + bat_priv->mcast.querier_ipv6.exists = querier6.exists; + bat_priv->mcast.querier_ipv6.shadowing = querier6.shadowing; + + bat_priv->mcast.bridged = bridged; if (!bat_priv->mcast.enabled || mcast_data.flags != bat_priv->mcast.flags) { - batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 1, + batadv_mcast_flags_log(bat_priv, mcast_data.flags); + batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.flags = mcast_data.flags; bat_priv->mcast.enabled = true; } - return true; + return !(mcast_data.flags & + (BATADV_MCAST_WANT_ALL_IPV4 + BATADV_MCAST_WANT_ALL_IPV6)); } /** @@ -274,6 +551,10 @@ void batadv_mcast_mla_update(struct batadv_priv *bat_priv) if (ret < 0) goto out; + ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list); + if (ret < 0) + goto out; + update: batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); @@ -282,6 +563,31 @@ out: batadv_mcast_mla_list_free(bat_priv, &mcast_list); } +/** + * batadv_mcast_is_report_ipv4 - check for IGMP reports + * @skb: the ethernet frame destined for the mesh + * + * This call might reallocate skb data. + * + * Checks whether the given frame is a valid IGMP report. + * + * Return: If so then true, otherwise false. + */ +static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) +{ + if (ip_mc_check_igmp(skb, NULL) < 0) + return false; + + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return true; + } + + return false; +} + /** * batadv_mcast_forw_mode_check_ipv4 - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information @@ -304,6 +610,9 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) return -ENOMEM; + if (batadv_mcast_is_report_ipv4(skb)) + return -EINVAL; + iphdr = ip_hdr(skb); /* TODO: Implement Multicast Router Discovery (RFC4286), @@ -320,6 +629,31 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, return 0; } +#if IS_ENABLED(CONFIG_IPV6) +/** + * batadv_mcast_is_report_ipv6 - check for MLD reports + * @skb: the ethernet frame destined for the mesh + * + * This call might reallocate skb data. + * + * Checks whether the given frame is a valid MLD report. + * + * Return: If so then true, otherwise false. + */ +static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) +{ + if (ipv6_mc_check_mld(skb, NULL) < 0) + return false; + + switch (icmp6_hdr(skb)->icmp6_type) { + case ICMPV6_MGM_REPORT: + case ICMPV6_MLD2_REPORT: + return true; + } + + return false; +} + /** * batadv_mcast_forw_mode_check_ipv6 - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information @@ -341,6 +675,9 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) return -ENOMEM; + if (batadv_mcast_is_report_ipv6(skb)) + return -EINVAL; + ip6hdr = ipv6_hdr(skb); /* TODO: Implement Multicast Router Discovery (RFC4286), @@ -357,6 +694,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, return 0; } +#endif /** * batadv_mcast_forw_mode_check - check for optimized forwarding potential @@ -385,9 +723,11 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, is_unsnoopable); +#if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, is_unsnoopable); +#endif default: return -EINVAL; } @@ -728,18 +1068,18 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, } /** - * batadv_mcast_tvlv_ogm_handler_v1 - process incoming multicast tvlv container + * batadv_mcast_tvlv_ogm_handler - process incoming multicast tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the multicast data * @tvlv_value_len: tvlv buffer length */ -static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig, - u8 flags, - void *tvlv_value, - u16 tvlv_value_len) +static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 flags, + void *tvlv_value, + u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); u8 mcast_flags = BATADV_NO_FLAGS; @@ -789,19 +1129,120 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, */ void batadv_mcast_init(struct batadv_priv *bat_priv) { - batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler_v1, - NULL, BATADV_TVLV_MCAST, 1, + batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, + NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); } +/** + * batadv_mcast_flags_print_header - print own mcast flags to debugfs table + * @bat_priv: the bat priv with all the soft interface information + * @seq: debugfs table seq_file struct + * + * Prints our own multicast flags including a more specific reason why + * they are set, that is prints the bridge and querier state too, to + * the debugfs table specified via @seq. + */ +static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv, + struct seq_file *seq) +{ + u8 flags = bat_priv->mcast.flags; + char querier4, querier6, shadowing4, shadowing6; + bool bridged = bat_priv->mcast.bridged; + + if (bridged) { + querier4 = bat_priv->mcast.querier_ipv4.exists ? '.' : '4'; + querier6 = bat_priv->mcast.querier_ipv6.exists ? '.' : '6'; + shadowing4 = bat_priv->mcast.querier_ipv4.shadowing ? '4' : '.'; + shadowing6 = bat_priv->mcast.querier_ipv6.shadowing ? '6' : '.'; + } else { + querier4 = '?'; + querier6 = '?'; + shadowing4 = '?'; + shadowing6 = '?'; + } + + seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n", + (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.'); + seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n", + querier4, querier6); + seq_printf(seq, "* Shadowing IGMP/MLD Querier [4/6]:\t%c/%c\n", + shadowing4, shadowing6); + seq_puts(seq, "-------------------------------------------\n"); + seq_printf(seq, " %-10s %s\n", "Originator", "Flags"); +} + +/** + * batadv_mcast_flags_seq_print_text - print the mcast flags of other nodes + * @seq: seq file to print on + * @offset: not used + * + * This prints a table of (primary) originators and their according + * multicast flags, including (in the header) our own. + * + * Return: always 0 + */ +int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_priv *bat_priv = netdev_priv(net_dev); + struct batadv_hard_iface *primary_if; + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct batadv_orig_node *orig_node; + struct hlist_head *head; + u8 flags; + u32 i; + + primary_if = batadv_seq_print_text_primary_if_get(seq); + if (!primary_if) + return 0; + + batadv_mcast_flags_print_header(bat_priv, seq); + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capa_initialized)) + continue; + + if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capabilities)) { + seq_printf(seq, "%pM -\n", orig_node->orig); + continue; + } + + flags = orig_node->mcast_flags; + + seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig, + (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) + ? 'U' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV4) + ? '4' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV6) + ? '6' : '.'); + } + rcu_read_unlock(); + } + + batadv_hardif_put(primary_if); + + return 0; +} + /** * batadv_mcast_free - free the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ void batadv_mcast_free(struct batadv_priv *bat_priv) { - batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1); - batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1); + batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); + batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); spin_lock_bh(&bat_priv->tt.commit_lock); batadv_mcast_mla_tt_retract(bat_priv, NULL); diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 80bceec55..1fb00ba84 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -20,6 +20,7 @@ #include "main.h" +struct seq_file; struct sk_buff; /** @@ -46,6 +47,8 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, void batadv_mcast_init(struct batadv_priv *bat_priv); +int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset); + void batadv_mcast_free(struct batadv_priv *bat_priv); void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node); diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c new file mode 100644 index 000000000..231f8eaf0 --- /dev/null +++ b/net/batman-adv/netlink.c @@ -0,0 +1,424 @@ +/* Copyright (C) 2016 B.A.T.M.A.N. contributors: + * + * Matthias Schiffer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "netlink.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hard-interface.h" +#include "soft-interface.h" +#include "tp_meter.h" + +struct sk_buff; + +static struct genl_family batadv_netlink_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = BATADV_NL_NAME, + .version = 1, + .maxattr = BATADV_ATTR_MAX, +}; + +/* multicast groups */ +enum batadv_netlink_multicast_groups { + BATADV_NL_MCGRP_TPMETER, +}; + +static struct genl_multicast_group batadv_netlink_mcgrps[] = { + [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER }, +}; + +static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { + [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, + [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, + [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, + [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, + [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, +}; + +/** + * batadv_netlink_mesh_info_put - fill in generic information about mesh + * interface + * @msg: netlink message to be sent back + * @soft_iface: interface for which the data should be taken + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_hard_iface *primary_if = NULL; + struct net_device *hard_iface; + int ret = -ENOBUFS; + + if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) || + nla_put_string(msg, BATADV_ATTR_ALGO_NAME, + bat_priv->algo_ops->name) || + nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) || + nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) || + nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, + soft_iface->dev_addr)) + goto out; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { + hard_iface = primary_if->net_dev; + + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + hard_iface->ifindex) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + hard_iface->name) || + nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, + hard_iface->dev_addr)) + goto out; + } + + ret = 0; + + out: + if (primary_if) + batadv_hardif_put(primary_if); + + return ret; +} + +/** + * batadv_netlink_get_mesh_info - handle incoming BATADV_CMD_GET_MESH_INFO + * netlink request + * @skb: received netlink message + * @info: receiver information + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_get_mesh_info(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct net_device *soft_iface; + struct sk_buff *msg = NULL; + void *msg_head; + int ifindex; + int ret; + + if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &batadv_netlink_family, 0, + BATADV_CMD_GET_MESH_INFO); + if (!msg_head) { + ret = -ENOBUFS; + goto out; + } + + ret = batadv_netlink_mesh_info_put(msg, soft_iface); + + out: + if (soft_iface) + dev_put(soft_iface); + + if (ret) { + if (msg) + nlmsg_free(msg); + return ret; + } + + genlmsg_end(msg, msg_head); + return genlmsg_reply(msg, info); +} + +/** + * batadv_netlink_tp_meter_put - Fill information of started tp_meter session + * @msg: netlink message to be sent back + * @cookie: tp meter session cookie + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) +{ + if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) + return -ENOBUFS; + + return 0; +} + +/** + * batadv_netlink_tpmeter_notify - send tp_meter result via netlink to client + * @bat_priv: the bat priv with all the soft interface information + * @dst: destination of tp_meter session + * @result: reason for tp meter session stop + * @test_time: total time ot the tp_meter session + * @total_bytes: bytes acked to the receiver + * @cookie: cookie of tp_meter session + * + * Return: 0 on success, < 0 on error + */ +int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, + u8 result, u32 test_time, u64 total_bytes, + u32 cookie) +{ + struct sk_buff *msg; + void *hdr; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &batadv_netlink_family, 0, + BATADV_CMD_TP_METER); + if (!hdr) { + ret = -ENOBUFS; + goto err_genlmsg; + } + + if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) + goto nla_put_failure; + + if (nla_put_u32(msg, BATADV_ATTR_TPMETER_TEST_TIME, test_time)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, BATADV_ATTR_TPMETER_BYTES, total_bytes, + BATADV_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u8(msg, BATADV_ATTR_TPMETER_RESULT, result)) + goto nla_put_failure; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, dst)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&batadv_netlink_family, + dev_net(bat_priv->soft_iface), msg, 0, + BATADV_NL_MCGRP_TPMETER, GFP_KERNEL); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + ret = -EMSGSIZE; + +err_genlmsg: + nlmsg_free(msg); + return ret; +} + +/** + * batadv_netlink_tp_meter_start - Start a new tp_meter session + * @skb: received netlink message + * @info: receiver information + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + struct sk_buff *msg = NULL; + u32 test_length; + void *msg_head; + int ifindex; + u32 cookie; + u8 *dst; + int ret; + + if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) + return -EINVAL; + + if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) + return -EINVAL; + + if (!info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); + if (!ifindex) + return -EINVAL; + + dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); + + test_length = nla_get_u32(info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]); + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &batadv_netlink_family, 0, + BATADV_CMD_TP_METER); + if (!msg_head) { + ret = -ENOBUFS; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + batadv_tp_start(bat_priv, dst, test_length, &cookie); + + ret = batadv_netlink_tp_meter_put(msg, cookie); + + out: + if (soft_iface) + dev_put(soft_iface); + + if (ret) { + if (msg) + nlmsg_free(msg); + return ret; + } + + genlmsg_end(msg, msg_head); + return genlmsg_reply(msg, info); +} + +/** + * batadv_netlink_tp_meter_start - Cancel a running tp_meter session + * @skb: received netlink message + * @info: receiver information + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + int ifindex; + u8 *dst; + int ret = 0; + + if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) + return -EINVAL; + + if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); + if (!ifindex) + return -EINVAL; + + dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + batadv_tp_stop(bat_priv, dst, BATADV_TP_REASON_CANCEL); + +out: + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + +static struct genl_ops batadv_netlink_ops[] = { + { + .cmd = BATADV_CMD_GET_MESH_INFO, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .doit = batadv_netlink_get_mesh_info, + }, + { + .cmd = BATADV_CMD_TP_METER, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .doit = batadv_netlink_tp_meter_start, + }, + { + .cmd = BATADV_CMD_TP_METER_CANCEL, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .doit = batadv_netlink_tp_meter_cancel, + }, +}; + +/** + * batadv_netlink_register - register batadv genl netlink family + */ +void __init batadv_netlink_register(void) +{ + int ret; + + ret = genl_register_family_with_ops_groups(&batadv_netlink_family, + batadv_netlink_ops, + batadv_netlink_mcgrps); + if (ret) + pr_warn("unable to register netlink family"); +} + +/** + * batadv_netlink_unregister - unregister batadv genl netlink family + */ +void batadv_netlink_unregister(void) +{ + genl_unregister_family(&batadv_netlink_family); +} diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h new file mode 100644 index 000000000..945653ab5 --- /dev/null +++ b/net/batman-adv/netlink.h @@ -0,0 +1,32 @@ +/* Copyright (C) 2016 B.A.T.M.A.N. contributors: + * + * Matthias Schiffer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_NETLINK_H_ +#define _NET_BATMAN_ADV_NETLINK_H_ + +#include "main.h" + +#include + +void batadv_netlink_register(void); +void batadv_netlink_unregister(void); + +int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, + u8 result, u32 test_time, u64 total_bytes, + u32 cookie); + +#endif /* _NET_BATMAN_ADV_NETLINK_H_ */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 678f06865..293ef4ffd 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -51,10 +51,12 @@ #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "routing.h" #include "send.h" +#include "tvlv.h" static struct lock_class_key batadv_nc_coding_hash_lock_class_key; static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index ab8c4f973..3940b5d24 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -34,11 +34,13 @@ #include #include +#include "bat_algo.h" #include "distributed-arp-table.h" #include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "multicast.h" #include "network-coding.h" #include "routing.h" @@ -251,10 +253,8 @@ static void batadv_neigh_node_release(struct kref *ref) struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; - struct batadv_algo_ops *bao; neigh_node = container_of(ref, struct batadv_neigh_node, refcount); - bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { @@ -263,9 +263,6 @@ static void batadv_neigh_node_release(struct kref *ref) batadv_hardif_neigh_put(neigh_node->hardif_neigh); - if (bao->bat_neigh_free) - bao->bat_neigh_free(neigh_node); - batadv_hardif_put(neigh_node->if_incoming); kfree_rcu(neigh_node, rcu); @@ -537,8 +534,8 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, kref_init(&hardif_neigh->refcount); - if (bat_priv->bat_algo_ops->bat_hardif_neigh_init) - bat_priv->bat_algo_ops->bat_hardif_neigh_init(hardif_neigh); + if (bat_priv->algo_ops->neigh.hardif_init) + bat_priv->algo_ops->neigh.hardif_init(hardif_neigh); hlist_add_head(&hardif_neigh->list, &hard_iface->neigh_list); @@ -602,19 +599,19 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, } /** - * batadv_neigh_node_new - create and init a new neigh_node object + * batadv_neigh_node_create - create a neigh node object * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface * * Allocates a new neigh_node object and initialises all the generic fields. * - * Return: neighbor when found. Othwerwise NULL + * Return: the neighbour node if found or created or NULL otherwise. */ -struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_orig_node *orig_node, - struct batadv_hard_iface *hard_iface, - const u8 *neigh_addr) +static struct batadv_neigh_node * +batadv_neigh_node_create(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) { struct batadv_neigh_node *neigh_node; struct batadv_hardif_neigh_node *hardif_neigh = NULL; @@ -666,6 +663,29 @@ out: return neigh_node; } +/** + * batadv_neigh_node_get_or_create - retrieve or create a neigh node object + * @orig_node: originator object representing the neighbour + * @hard_iface: the interface where the neighbour is connected to + * @neigh_addr: the mac address of the neighbour interface + * + * Return: the neighbour node if found or created or NULL otherwise. + */ +struct batadv_neigh_node * +batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) +{ + struct batadv_neigh_node *neigh_node = NULL; + + /* first check without locking to avoid the overhead */ + neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); + if (neigh_node) + return neigh_node; + + return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr); +} + /** * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list * @seq: neighbour table seq_file struct @@ -686,17 +706,17 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", BATADV_SOURCE_VERSION, primary_if->net_dev->name, primary_if->net_dev->dev_addr, net_dev->name, - bat_priv->bat_algo_ops->name); + bat_priv->algo_ops->name); batadv_hardif_put(primary_if); - if (!bat_priv->bat_algo_ops->bat_neigh_print) { + if (!bat_priv->algo_ops->neigh.print) { seq_puts(seq, "No printing function for this routing protocol\n"); return 0; } - bat_priv->bat_algo_ops->bat_neigh_print(bat_priv, seq); + bat_priv->algo_ops->neigh.print(bat_priv, seq); return 0; } @@ -747,8 +767,8 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) batadv_frag_purge_orig(orig_node, NULL); - if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) - orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); + if (orig_node->bat_priv->algo_ops->orig.free) + orig_node->bat_priv->algo_ops->orig.free(orig_node); kfree(orig_node->tt_buff); kfree(orig_node); @@ -1092,12 +1112,12 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *best = NULL, *neigh; - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; rcu_read_lock(); hlist_for_each_entry_rcu(neigh, &orig_node->neigh_list, list) { - if (best && (bao->bat_neigh_cmp(neigh, if_outgoing, - best, if_outgoing) <= 0)) + if (best && (bao->neigh.cmp(neigh, if_outgoing, best, + if_outgoing) <= 0)) continue; if (!kref_get_unless_zero(&neigh->refcount)) @@ -1249,18 +1269,17 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", BATADV_SOURCE_VERSION, primary_if->net_dev->name, primary_if->net_dev->dev_addr, net_dev->name, - bat_priv->bat_algo_ops->name); + bat_priv->algo_ops->name); batadv_hardif_put(primary_if); - if (!bat_priv->bat_algo_ops->bat_orig_print) { + if (!bat_priv->algo_ops->orig.print) { seq_puts(seq, "No printing function for this routing protocol\n"); return 0; } - bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq, - BATADV_IF_DEFAULT); + bat_priv->algo_ops->orig.print(bat_priv, seq, BATADV_IF_DEFAULT); return 0; } @@ -1287,7 +1306,7 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) } bat_priv = netdev_priv(hard_iface->soft_iface); - if (!bat_priv->bat_algo_ops->bat_orig_print) { + if (!bat_priv->algo_ops->orig.print) { seq_puts(seq, "No printing function for this routing protocol\n"); goto out; @@ -1301,9 +1320,9 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "[B.A.T.M.A.N. adv %s, IF/MAC: %s/%pM (%s %s)]\n", BATADV_SOURCE_VERSION, hard_iface->net_dev->name, hard_iface->net_dev->dev_addr, - hard_iface->soft_iface->name, bat_priv->bat_algo_ops->name); + hard_iface->soft_iface->name, bat_priv->algo_ops->name); - bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq, hard_iface); + bat_priv->algo_ops->orig.print(bat_priv, seq, hard_iface); out: if (hard_iface) @@ -1315,7 +1334,7 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; @@ -1331,9 +1350,8 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { ret = 0; - if (bao->bat_orig_add_if) - ret = bao->bat_orig_add_if(orig_node, - max_if_num); + if (bao->orig.add_if) + ret = bao->orig.add_if(orig_node, max_if_num); if (ret == -ENOMEM) goto err; } @@ -1355,7 +1373,7 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, struct hlist_head *head; struct batadv_hard_iface *hard_iface_tmp; struct batadv_orig_node *orig_node; - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; u32 i; int ret; @@ -1368,10 +1386,9 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { ret = 0; - if (bao->bat_orig_del_if) - ret = bao->bat_orig_del_if(orig_node, - max_if_num, - hard_iface->if_num); + if (bao->orig.del_if) + ret = bao->orig.del_if(orig_node, max_if_num, + hard_iface->if_num); if (ret == -ENOMEM) goto err; } diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 64a8951e5..566306bf0 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -46,9 +46,9 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh); struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_orig_node *orig_node, - struct batadv_hard_iface *hard_iface, - const u8 *neigh_addr); +batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr); void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 372128ddb..6b011ff64 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -21,6 +21,8 @@ #include #include +#define batadv_tp_is_error(n) ((u8)n > 127 ? 1 : 0) + /** * enum batadv_packettype - types for batman-adv encapsulated packets * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV @@ -93,6 +95,7 @@ enum batadv_icmp_packettype { BATADV_ECHO_REQUEST = 8, BATADV_TTL_EXCEEDED = 11, BATADV_PARAMETER_PROBLEM = 12, + BATADV_TP = 15, }; /** @@ -284,6 +287,16 @@ struct batadv_elp_packet { #define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet) +/** + * enum batadv_icmp_user_cmd_type - types for batman-adv icmp cmd modes + * @BATADV_TP_START: start a throughput meter run + * @BATADV_TP_STOP: stop a throughput meter run + */ +enum batadv_icmp_user_cmd_type { + BATADV_TP_START = 0, + BATADV_TP_STOP = 2, +}; + /** * struct batadv_icmp_header - common members among all the ICMP packets * @packet_type: batman-adv packet type, part of the general header @@ -334,6 +347,47 @@ struct batadv_icmp_packet { __be16 seqno; }; +/** + * struct batadv_icmp_tp_packet - ICMP TP Meter packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @msg_type: ICMP packet type + * @dst: address of the destination node + * @orig: address of the source node + * @uid: local ICMP socket identifier + * @subtype: TP packet subtype (see batadv_icmp_tp_subtype) + * @session: TP session identifier + * @seqno: the TP sequence number + * @timestamp: time when the packet has been sent. This value is filled in a + * TP_MSG and echoed back in the next TP_ACK so that the sender can compute the + * RTT. Since it is read only by the host which wrote it, there is no need to + * store it using network order + */ +struct batadv_icmp_tp_packet { + u8 packet_type; + u8 version; + u8 ttl; + u8 msg_type; /* see ICMP message types above */ + u8 dst[ETH_ALEN]; + u8 orig[ETH_ALEN]; + u8 uid; + u8 subtype; + u8 session[2]; + __be32 seqno; + __be32 timestamp; +}; + +/** + * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes + * @BATADV_TP_MSG: Msg from sender to receiver + * @BATADV_TP_ACK: acknowledgment from receiver to sender + */ +enum batadv_icmp_tp_subtype { + BATADV_TP_MSG = 0, + BATADV_TP_ACK, +}; + #define BATADV_RR_LEN 16 /** @@ -420,6 +474,7 @@ struct batadv_unicast_4addr_packet { * @dest: final destination used when routing fragments * @orig: originator of the fragment used when merging the packet * @no: fragment number within this sequence + * @priority: priority of frame, from ToS IP precedence or 802.1p * @reserved: reserved byte for alignment * @seqno: sequence identification * @total_size: size of the merged packet @@ -430,9 +485,11 @@ struct batadv_frag_packet { u8 ttl; #if defined(__BIG_ENDIAN_BITFIELD) u8 no:4; - u8 reserved:4; + u8 priority:3; + u8 reserved:1; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 reserved:4; + u8 reserved:1; + u8 priority:3; u8 no:4; #else #error "unknown bitfield endianness" diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index bfac086b4..3d199478c 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -40,12 +40,15 @@ #include "fragmentation.h" #include "hard-interface.h" #include "icmp_socket.h" +#include "log.h" #include "network-coding.h" #include "originator.h" #include "packet.h" #include "send.h" #include "soft-interface.h" +#include "tp_meter.h" #include "translation-table.h" +#include "tvlv.h" static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); @@ -268,10 +271,19 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, icmph->ttl = BATADV_TTL; res = batadv_send_skb_to_orig(skb, orig_node, NULL); - if (res != NET_XMIT_DROP) - ret = NET_RX_SUCCESS; + if (res == -1) + goto out; + + ret = NET_RX_SUCCESS; break; + case BATADV_TP: + if (!pskb_may_pull(skb, sizeof(struct batadv_icmp_tp_packet))) + goto out; + + batadv_tp_meter_recv(bat_priv, skb); + ret = NET_RX_SUCCESS; + goto out; default: /* drop unknown type */ goto out; @@ -290,7 +302,7 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if = NULL; struct batadv_orig_node *orig_node = NULL; struct batadv_icmp_packet *icmp_packet; - int ret = NET_RX_DROP; + int res, ret = NET_RX_DROP; icmp_packet = (struct batadv_icmp_packet *)skb->data; @@ -321,7 +333,8 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, icmp_packet->msg_type = BATADV_TTL_EXCEEDED; icmp_packet->ttl = BATADV_TTL; - if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) + res = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (res != -1) ret = NET_RX_SUCCESS; out: @@ -341,7 +354,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, struct ethhdr *ethhdr; struct batadv_orig_node *orig_node = NULL; int hdr_size = sizeof(struct batadv_icmp_header); - int ret = NET_RX_DROP; + int res, ret = NET_RX_DROP; /* drop packet if it has not necessary minimum size */ if (unlikely(!pskb_may_pull(skb, hdr_size))) @@ -408,7 +421,8 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, icmph->ttl--; /* route it */ - if (batadv_send_skb_to_orig(skb, orig_node, recv_if) != NET_XMIT_DROP) + res = batadv_send_skb_to_orig(skb, orig_node, recv_if); + if (res != -1) ret = NET_RX_SUCCESS; out: @@ -455,6 +469,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, return 0; } +/** + * batadv_last_bonding_get - Get last_bonding_candidate of orig_node + * @orig_node: originator node whose last bonding candidate should be retrieved + * + * Return: last bonding candidate of router or NULL if not found + * + * The object is returned with refcounter increased by 1. + */ +static struct batadv_orig_ifinfo * +batadv_last_bonding_get(struct batadv_orig_node *orig_node) +{ + struct batadv_orig_ifinfo *last_bonding_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + last_bonding_candidate = orig_node->last_bonding_candidate; + + if (last_bonding_candidate) + kref_get(&last_bonding_candidate->refcount); + spin_unlock_bh(&orig_node->neigh_list_lock); + + return last_bonding_candidate; +} + /** * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node * @orig_node: originator node whose bonding candidates should be replaced @@ -492,7 +529,7 @@ batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if) { - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; struct batadv_neigh_node *first_candidate_router = NULL; struct batadv_neigh_node *next_candidate_router = NULL; struct batadv_neigh_node *router, *cand_router = NULL; @@ -525,7 +562,7 @@ batadv_find_router(struct batadv_priv *bat_priv, * router - obviously there are no other candidates. */ rcu_read_lock(); - last_candidate = orig_node->last_bonding_candidate; + last_candidate = batadv_last_bonding_get(orig_node); if (last_candidate) last_cand_router = rcu_dereference(last_candidate->router); @@ -546,9 +583,9 @@ batadv_find_router(struct batadv_priv *bat_priv, /* alternative candidate should be good enough to be * considered */ - if (!bao->bat_neigh_is_similar_or_better(cand_router, - cand->if_outgoing, - router, recv_if)) + if (!bao->neigh.is_similar_or_better(cand_router, + cand->if_outgoing, router, + recv_if)) goto next; /* don't use the same router twice */ @@ -617,6 +654,9 @@ next: batadv_orig_ifinfo_put(next_candidate); } + if (last_candidate) + batadv_orig_ifinfo_put(last_candidate); + return router; } @@ -671,6 +711,8 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, len = skb->len; res = batadv_send_skb_to_orig(skb, orig_node, recv_if); + if (res == -1) + goto out; /* translate transmit result into receive result */ if (res == NET_XMIT_SUCCESS) { @@ -678,13 +720,10 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, len + ETH_HLEN); - - ret = NET_RX_SUCCESS; - } else if (res == NET_XMIT_POLICED) { - /* skb was buffered and consumed */ - ret = NET_RX_SUCCESS; } + ret = NET_RX_SUCCESS; + out: if (orig_node) batadv_orig_node_put(orig_node); @@ -1033,6 +1072,8 @@ int batadv_recv_frag_packet(struct sk_buff *skb, if (!orig_node_src) goto out; + skb->priority = frag_packet->priority + 256; + /* Route the fragment if it is not for us and too big to be merged. */ if (!batadv_is_my_mac(bat_priv, frag_packet->dest) && batadv_frag_skb_fwd(skb, recv_if, orig_node_src)) { diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 010397650..619115948 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -20,10 +20,11 @@ #include #include +#include #include #include -#include #include +#include #include #include #include @@ -42,6 +43,7 @@ #include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" +#include "log.h" #include "network-coding.h" #include "originator.h" #include "routing.h" @@ -71,6 +73,7 @@ int batadv_send_skb_packet(struct sk_buff *skb, { struct batadv_priv *bat_priv; struct ethhdr *ethhdr; + int ret; bat_priv = netdev_priv(hard_iface->soft_iface); @@ -108,8 +111,15 @@ int batadv_send_skb_packet(struct sk_buff *skb, /* dev_queue_xmit() returns a negative result on error. However on * congestion and traffic shaping, it drops and returns NET_XMIT_DROP * (which is > 0). This will not be treated as an error. + * + * a negative value cannot be returned because it could be interepreted + * as not consumed skb by callers of batadv_send_skb_to_orig. */ - return dev_queue_xmit(skb); + ret = dev_queue_xmit(skb); + if (ret < 0) + ret = NET_XMIT_DROP; + + return ret; send_skb_err: kfree_skb(skb); return NET_XMIT_DROP; @@ -155,8 +165,11 @@ int batadv_send_unicast_skb(struct sk_buff *skb, * host, NULL can be passed as recv_if and no interface alternating is * attempted. * - * Return: NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or - * NET_XMIT_POLICED if the skb is buffered for later transmit. + * Return: -1 on failure (and the skb is not consumed), -EINPROGRESS if the + * skb is buffered for later transmit or the NET_XMIT status returned by the + * lower routine if the packet has been passed down. + * + * If the returning value is not -1 the skb has been consumed. */ int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, @@ -164,7 +177,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, { struct batadv_priv *bat_priv = orig_node->bat_priv; struct batadv_neigh_node *neigh_node; - int ret = NET_XMIT_DROP; + int ret = -1; /* batadv_find_router() increases neigh_nodes refcount if found. */ neigh_node = batadv_find_router(bat_priv, orig_node, recv_if); @@ -177,8 +190,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, if (atomic_read(&bat_priv->fragmentation) && skb->len > neigh_node->if_incoming->net_dev->mtu) { /* Fragment and send packet. */ - if (batadv_frag_send_packet(skb, orig_node, neigh_node)) - ret = NET_XMIT_SUCCESS; + ret = batadv_frag_send_packet(skb, orig_node, neigh_node); goto out; } @@ -187,12 +199,10 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, * (i.e. being forwarded). If the packet originates from this node or if * network coding fails, then send the packet as usual. */ - if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) { - ret = NET_XMIT_POLICED; - } else { - batadv_send_unicast_skb(skb, neigh_node); - ret = NET_XMIT_SUCCESS; - } + if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) + ret = -EINPROGRESS; + else + ret = batadv_send_unicast_skb(skb, neigh_node); out: if (neigh_node) @@ -318,7 +328,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, { struct batadv_unicast_packet *unicast_packet; struct ethhdr *ethhdr; - int ret = NET_XMIT_DROP; + int res, ret = NET_XMIT_DROP; if (!orig_node) goto out; @@ -355,7 +365,8 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) unicast_packet->ttvn = unicast_packet->ttvn - 1; - if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) + res = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (res != -1) ret = NET_XMIT_SUCCESS; out: @@ -428,27 +439,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, BATADV_P_DATA, orig_node, vid); } -void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface) -{ - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - - if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || - (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) - return; - - /* the interface gets activated here to avoid race conditions between - * the moment of activating the interface in - * hardif_activate_interface() where the originator mac is set and - * outdated packets (especially uninitialized mac addresses) in the - * packet queue - */ - if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) - hard_iface->if_status = BATADV_IF_ACTIVE; - - bat_priv->bat_algo_ops->bat_ogm_schedule(hard_iface); -} - -static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) +void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) { kfree_skb(forw_packet->skb); if (forw_packet->if_incoming) @@ -604,45 +595,6 @@ out: atomic_inc(&bat_priv->bcast_queue_left); } -void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work) -{ - struct delayed_work *delayed_work; - struct batadv_forw_packet *forw_packet; - struct batadv_priv *bat_priv; - - delayed_work = to_delayed_work(work); - forw_packet = container_of(delayed_work, struct batadv_forw_packet, - delayed_work); - bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); - spin_lock_bh(&bat_priv->forw_bat_list_lock); - hlist_del(&forw_packet->list); - spin_unlock_bh(&bat_priv->forw_bat_list_lock); - - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) - goto out; - - bat_priv->bat_algo_ops->bat_ogm_emit(forw_packet); - - /* we have to have at least one packet in the queue to determine the - * queues wake up time unless we are shutting down. - * - * only re-schedule if this is the "original" copy, e.g. the OGM of the - * primary interface should only be rescheduled once per period, but - * this function will be called for the forw_packet instances of the - * other secondary interfaces as well. - */ - if (forw_packet->own && - forw_packet->if_incoming == forw_packet->if_outgoing) - batadv_schedule_bat_ogm(forw_packet->if_incoming); - -out: - /* don't count own packet */ - if (!forw_packet->own) - atomic_inc(&bat_priv->batman_queue_left); - - batadv_forw_packet_free(forw_packet); -} - void batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface) diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 6fd7270d8..7cecb7563 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -26,8 +26,8 @@ #include "packet.h" struct sk_buff; -struct work_struct; +void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet); int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); @@ -38,11 +38,9 @@ int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface); int batadv_send_unicast_skb(struct sk_buff *skb, struct batadv_neigh_node *neigh_node); -void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface); int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, const struct sk_buff *skb, unsigned long delay); -void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work); void batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 287a3879e..7527c0652 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -48,6 +48,7 @@ #include #include +#include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "debugfs.h" #include "distributed-arp-table.h" @@ -255,7 +256,7 @@ static int batadv_interface_tx(struct sk_buff *skb, if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; - gw_mode = atomic_read(&bat_priv->gw_mode); + gw_mode = atomic_read(&bat_priv->gw.mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* if gw mode is off, broadcast every packet */ if (gw_mode == BATADV_GW_MODE_OFF) { @@ -808,6 +809,10 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST + bat_priv->mcast.querier_ipv4.exists = false; + bat_priv->mcast.querier_ipv4.shadowing = false; + bat_priv->mcast.querier_ipv6.exists = false; + bat_priv->mcast.querier_ipv6.shadowing = false; bat_priv->mcast.flags = BATADV_NO_FLAGS; atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->mcast.num_disabled, 0); @@ -815,8 +820,8 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); #endif - atomic_set(&bat_priv->gw_mode, BATADV_GW_MODE_OFF); - atomic_set(&bat_priv->gw_sel_class, 20); + atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); + atomic_set(&bat_priv->gw.sel_class, 20); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); @@ -837,6 +842,8 @@ static int batadv_softif_init_late(struct net_device *dev) #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); #endif + atomic_set(&bat_priv->tp_num, 0); + bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; bat_priv->isolation_mark = 0; diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 414b20741..fe9ca94dd 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -38,11 +38,12 @@ #include #include +#include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" -#include "bridge_loop_avoidance.h" #include "hard-interface.h" +#include "log.h" #include "network-coding.h" #include "packet.h" #include "soft-interface.h" @@ -389,12 +390,12 @@ static int batadv_store_uint_attr(const char *buff, size_t count, return count; } -static inline ssize_t -__batadv_store_uint_attr(const char *buff, size_t count, - int min, int max, - void (*post_func)(struct net_device *), - const struct attribute *attr, - atomic_t *attr_store, struct net_device *net_dev) +static ssize_t __batadv_store_uint_attr(const char *buff, size_t count, + int min, int max, + void (*post_func)(struct net_device *), + const struct attribute *attr, + atomic_t *attr_store, + struct net_device *net_dev) { int ret; @@ -411,7 +412,7 @@ static ssize_t batadv_show_bat_algo(struct kobject *kobj, { struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - return sprintf(buff, "%s\n", bat_priv->bat_algo_ops->name); + return sprintf(buff, "%s\n", bat_priv->algo_ops->name); } static void batadv_post_gw_reselect(struct net_device *net_dev) @@ -427,7 +428,7 @@ static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr, struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); int bytes_written; - switch (atomic_read(&bat_priv->gw_mode)) { + switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_CLIENT: bytes_written = sprintf(buff, "%s\n", BATADV_GW_MODE_CLIENT_NAME); @@ -476,10 +477,10 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, return -EINVAL; } - if (atomic_read(&bat_priv->gw_mode) == gw_mode_tmp) + if (atomic_read(&bat_priv->gw.mode) == gw_mode_tmp) return count; - switch (atomic_read(&bat_priv->gw_mode)) { + switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_CLIENT: curr_gw_mode_str = BATADV_GW_MODE_CLIENT_NAME; break; @@ -508,7 +509,7 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, * state */ batadv_gw_check_client_stop(bat_priv); - atomic_set(&bat_priv->gw_mode, (unsigned int)gw_mode_tmp); + atomic_set(&bat_priv->gw.mode, (unsigned int)gw_mode_tmp); batadv_gw_tvlv_container_update(bat_priv); return count; } @@ -624,7 +625,7 @@ BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER, INT_MAX, NULL); BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE, NULL); -BATADV_ATTR_SIF_UINT(gw_sel_class, gw_sel_class, S_IRUGO | S_IWUSR, 1, +BATADV_ATTR_SIF_UINT(gw_sel_class, gw.sel_class, S_IRUGO | S_IWUSR, 1, BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect); static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, batadv_store_gw_bwidth); diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c new file mode 100644 index 000000000..2333777f9 --- /dev/null +++ b/net/batman-adv/tp_meter.c @@ -0,0 +1,1507 @@ +/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: + * + * Edo Monticelli, Antonio Quartulli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "tp_meter.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hard-interface.h" +#include "log.h" +#include "netlink.h" +#include "originator.h" +#include "packet.h" +#include "send.h" + +/** + * BATADV_TP_DEF_TEST_LENGTH - Default test length if not specified by the user + * in milliseconds + */ +#define BATADV_TP_DEF_TEST_LENGTH 10000 + +/** + * BATADV_TP_AWND - Advertised window by the receiver (in bytes) + */ +#define BATADV_TP_AWND 0x20000000 + +/** + * BATADV_TP_RECV_TIMEOUT - Receiver activity timeout. If the receiver does not + * get anything for such amount of milliseconds, the connection is killed + */ +#define BATADV_TP_RECV_TIMEOUT 1000 + +/** + * BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond + * such amound of milliseconds, the receiver is considered unreachable and the + * connection is killed + */ +#define BATADV_TP_MAX_RTO 30000 + +/** + * BATADV_TP_FIRST_SEQ - First seqno of each session. The number is rather high + * in order to immediately trigger a wrap around (test purposes) + */ +#define BATADV_TP_FIRST_SEQ ((u32)-1 - 2000) + +/** + * BATADV_TP_PLEN - length of the payload (data after the batadv_unicast header) + * to simulate + */ +#define BATADV_TP_PLEN (BATADV_TP_PACKET_LEN - ETH_HLEN - \ + sizeof(struct batadv_unicast_packet)) + +static u8 batadv_tp_prerandom[4096] __read_mostly; + +/** + * batadv_tp_session_cookie - generate session cookie based on session ids + * @session: TP session identifier + * @icmp_uid: icmp pseudo uid of the tp session + * + * Return: 32 bit tp_meter session cookie + */ +static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid) +{ + u32 cookie; + + cookie = icmp_uid << 16; + cookie |= session[0] << 8; + cookie |= session[1]; + + return cookie; +} + +/** + * batadv_tp_cwnd - compute the new cwnd size + * @base: base cwnd size value + * @increment: the value to add to base to get the new size + * @min: minumim cwnd value (usually MSS) + * + * Return the new cwnd size and ensures it does not exceed the Advertised + * Receiver Window size. It is wrap around safe. + * For details refer to Section 3.1 of RFC5681 + * + * Return: new congestion window size in bytes + */ +static u32 batadv_tp_cwnd(u32 base, u32 increment, u32 min) +{ + u32 new_size = base + increment; + + /* check for wrap-around */ + if (new_size < base) + new_size = (u32)ULONG_MAX; + + new_size = min_t(u32, new_size, BATADV_TP_AWND); + + return max_t(u32, new_size, min); +} + +/** + * batadv_tp_updated_cwnd - update the Congestion Windows + * @tp_vars: the private data of the current TP meter session + * @mss: maximum segment size of transmission + * + * 1) if the session is in Slow Start, the CWND has to be increased by 1 + * MSS every unique received ACK + * 2) if the session is in Congestion Avoidance, the CWND has to be + * increased by MSS * MSS / CWND for every unique received ACK + */ +static void batadv_tp_update_cwnd(struct batadv_tp_vars *tp_vars, u32 mss) +{ + spin_lock_bh(&tp_vars->cwnd_lock); + + /* slow start... */ + if (tp_vars->cwnd <= tp_vars->ss_threshold) { + tp_vars->dec_cwnd = 0; + tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss); + spin_unlock_bh(&tp_vars->cwnd_lock); + return; + } + + /* increment CWND at least of 1 (section 3.1 of RFC5681) */ + tp_vars->dec_cwnd += max_t(u32, 1U << 3, + ((mss * mss) << 6) / (tp_vars->cwnd << 3)); + if (tp_vars->dec_cwnd < (mss << 3)) { + spin_unlock_bh(&tp_vars->cwnd_lock); + return; + } + + tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss); + tp_vars->dec_cwnd = 0; + + spin_unlock_bh(&tp_vars->cwnd_lock); +} + +/** + * batadv_tp_update_rto - calculate new retransmission timeout + * @tp_vars: the private data of the current TP meter session + * @new_rtt: new roundtrip time in msec + */ +static void batadv_tp_update_rto(struct batadv_tp_vars *tp_vars, + u32 new_rtt) +{ + long m = new_rtt; + + /* RTT update + * Details in Section 2.2 and 2.3 of RFC6298 + * + * It's tricky to understand. Don't lose hair please. + * Inspired by tcp_rtt_estimator() tcp_input.c + */ + if (tp_vars->srtt != 0) { + m -= (tp_vars->srtt >> 3); /* m is now error in rtt est */ + tp_vars->srtt += m; /* rtt = 7/8 srtt + 1/8 new */ + if (m < 0) + m = -m; + + m -= (tp_vars->rttvar >> 2); + tp_vars->rttvar += m; /* mdev ~= 3/4 rttvar + 1/4 new */ + } else { + /* first measure getting in */ + tp_vars->srtt = m << 3; /* take the measured time to be srtt */ + tp_vars->rttvar = m << 1; /* new_rtt / 2 */ + } + + /* rto = srtt + 4 * rttvar. + * rttvar is scaled by 4, therefore doesn't need to be multiplied + */ + tp_vars->rto = (tp_vars->srtt >> 3) + tp_vars->rttvar; +} + +/** + * batadv_tp_batctl_notify - send client status result to client + * @reason: reason for tp meter session stop + * @dst: destination of tp_meter session + * @bat_priv: the bat priv with all the soft interface information + * @start_time: start of transmission in jiffies + * @total_sent: bytes acked to the receiver + * @cookie: cookie of tp_meter session + */ +static void batadv_tp_batctl_notify(enum batadv_tp_meter_reason reason, + const u8 *dst, struct batadv_priv *bat_priv, + unsigned long start_time, u64 total_sent, + u32 cookie) +{ + u32 test_time; + u8 result; + u32 total_bytes; + + if (!batadv_tp_is_error(reason)) { + result = BATADV_TP_REASON_COMPLETE; + test_time = jiffies_to_msecs(jiffies - start_time); + total_bytes = total_sent; + } else { + result = reason; + test_time = 0; + total_bytes = 0; + } + + batadv_netlink_tpmeter_notify(bat_priv, dst, result, test_time, + total_bytes, cookie); +} + +/** + * batadv_tp_batctl_error_notify - send client error result to client + * @reason: reason for tp meter session stop + * @dst: destination of tp_meter session + * @bat_priv: the bat priv with all the soft interface information + * @cookie: cookie of tp_meter session + */ +static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, + const u8 *dst, + struct batadv_priv *bat_priv, + u32 cookie) +{ + batadv_tp_batctl_notify(reason, dst, bat_priv, 0, 0, cookie); +} + +/** + * batadv_tp_list_find - find a tp_vars object in the global list + * @bat_priv: the bat priv with all the soft interface information + * @dst: the other endpoint MAC address to look for + * + * Look for a tp_vars object matching dst as end_point and return it after + * having incremented the refcounter. Return NULL is not found + * + * Return: matching tp_vars or NULL when no tp_vars with @dst was found + */ +static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, + const u8 *dst) +{ + struct batadv_tp_vars *pos, *tp_vars = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(pos, &bat_priv->tp_list, list) { + if (!batadv_compare_eth(pos->other_end, dst)) + continue; + + /* most of the time this function is invoked during the normal + * process..it makes sens to pay more when the session is + * finished and to speed the process up during the measurement + */ + if (unlikely(!kref_get_unless_zero(&pos->refcount))) + continue; + + tp_vars = pos; + break; + } + rcu_read_unlock(); + + return tp_vars; +} + +/** + * batadv_tp_list_find_session - find tp_vars session object in the global list + * @bat_priv: the bat priv with all the soft interface information + * @dst: the other endpoint MAC address to look for + * @session: session identifier + * + * Look for a tp_vars object matching dst as end_point, session as tp meter + * session and return it after having incremented the refcounter. Return NULL + * is not found + * + * Return: matching tp_vars or NULL when no tp_vars was found + */ +static struct batadv_tp_vars * +batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst, + const u8 *session) +{ + struct batadv_tp_vars *pos, *tp_vars = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(pos, &bat_priv->tp_list, list) { + if (!batadv_compare_eth(pos->other_end, dst)) + continue; + + if (memcmp(pos->session, session, sizeof(pos->session)) != 0) + continue; + + /* most of the time this function is invoked during the normal + * process..it makes sense to pay more when the session is + * finished and to speed the process up during the measurement + */ + if (unlikely(!kref_get_unless_zero(&pos->refcount))) + continue; + + tp_vars = pos; + break; + } + rcu_read_unlock(); + + return tp_vars; +} + +/** + * batadv_tp_vars_release - release batadv_tp_vars from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the batadv_tp_vars + */ +static void batadv_tp_vars_release(struct kref *ref) +{ + struct batadv_tp_vars *tp_vars; + struct batadv_tp_unacked *un, *safe; + + tp_vars = container_of(ref, struct batadv_tp_vars, refcount); + + /* lock should not be needed because this object is now out of any + * context! + */ + spin_lock_bh(&tp_vars->unacked_lock); + list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { + list_del(&un->list); + kfree(un); + } + spin_unlock_bh(&tp_vars->unacked_lock); + + kfree_rcu(tp_vars, rcu); +} + +/** + * batadv_tp_vars_put - decrement the batadv_tp_vars refcounter and possibly + * release it + * @tp_vars: the private data of the current TP meter session to be free'd + */ +static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) +{ + kref_put(&tp_vars->refcount, batadv_tp_vars_release); +} + +/** + * batadv_tp_sender_cleanup - cleanup sender data and drop and timer + * @bat_priv: the bat priv with all the soft interface information + * @tp_vars: the private data of the current TP meter session to cleanup + */ +static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, + struct batadv_tp_vars *tp_vars) +{ + cancel_delayed_work(&tp_vars->finish_work); + + spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); + hlist_del_rcu(&tp_vars->list); + spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); + + /* drop list reference */ + batadv_tp_vars_put(tp_vars); + + atomic_dec(&tp_vars->bat_priv->tp_num); + + /* kill the timer and remove its reference */ + del_timer_sync(&tp_vars->timer); + /* the worker might have rearmed itself therefore we kill it again. Note + * that if the worker should run again before invoking the following + * del_timer(), it would not re-arm itself once again because the status + * is OFF now + */ + del_timer(&tp_vars->timer); + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_sender_end - print info about ended session and inform client + * @bat_priv: the bat priv with all the soft interface information + * @tp_vars: the private data of the current TP meter session + */ +static void batadv_tp_sender_end(struct batadv_priv *bat_priv, + struct batadv_tp_vars *tp_vars) +{ + u32 session_cookie; + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Test towards %pM finished..shutting down (reason=%d)\n", + tp_vars->other_end, tp_vars->reason); + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Last timing stats: SRTT=%ums RTTVAR=%ums RTO=%ums\n", + tp_vars->srtt >> 3, tp_vars->rttvar >> 2, tp_vars->rto); + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Final values: cwnd=%u ss_threshold=%u\n", + tp_vars->cwnd, tp_vars->ss_threshold); + + session_cookie = batadv_tp_session_cookie(tp_vars->session, + tp_vars->icmp_uid); + + batadv_tp_batctl_notify(tp_vars->reason, + tp_vars->other_end, + bat_priv, + tp_vars->start_time, + atomic64_read(&tp_vars->tot_sent), + session_cookie); +} + +/** + * batadv_tp_sender_shutdown - let sender thread/timer stop gracefully + * @tp_vars: the private data of the current TP meter session + * @reason: reason for tp meter session stop + */ +static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars, + enum batadv_tp_meter_reason reason) +{ + if (!atomic_dec_and_test(&tp_vars->sending)) + return; + + tp_vars->reason = reason; +} + +/** + * batadv_tp_sender_finish - stop sender session after test_length was reached + * @work: delayed work reference of the related tp_vars + */ +static void batadv_tp_sender_finish(struct work_struct *work) +{ + struct delayed_work *delayed_work; + struct batadv_tp_vars *tp_vars; + + delayed_work = to_delayed_work(work); + tp_vars = container_of(delayed_work, struct batadv_tp_vars, + finish_work); + + batadv_tp_sender_shutdown(tp_vars, BATADV_TP_REASON_COMPLETE); +} + +/** + * batadv_tp_reset_sender_timer - reschedule the sender timer + * @tp_vars: the private TP meter data for this session + * + * Reschedule the timer using tp_vars->rto as delay + */ +static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) +{ + /* most of the time this function is invoked while normal packet + * reception... + */ + if (unlikely(atomic_read(&tp_vars->sending) == 0)) + /* timer ref will be dropped in batadv_tp_sender_cleanup */ + return; + + mod_timer(&tp_vars->timer, jiffies + msecs_to_jiffies(tp_vars->rto)); +} + +/** + * batadv_tp_sender_timeout - timer that fires in case of packet loss + * @arg: address of the related tp_vars + * + * If fired it means that there was packet loss. + * Switch to Slow Start, set the ss_threshold to half of the current cwnd and + * reset the cwnd to 3*MSS + */ +static void batadv_tp_sender_timeout(unsigned long arg) +{ + struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg; + struct batadv_priv *bat_priv = tp_vars->bat_priv; + + if (atomic_read(&tp_vars->sending) == 0) + return; + + /* if the user waited long enough...shutdown the test */ + if (unlikely(tp_vars->rto >= BATADV_TP_MAX_RTO)) { + batadv_tp_sender_shutdown(tp_vars, + BATADV_TP_REASON_DST_UNREACHABLE); + return; + } + + /* RTO exponential backoff + * Details in Section 5.5 of RFC6298 + */ + tp_vars->rto <<= 1; + + spin_lock_bh(&tp_vars->cwnd_lock); + + tp_vars->ss_threshold = tp_vars->cwnd >> 1; + if (tp_vars->ss_threshold < BATADV_TP_PLEN * 2) + tp_vars->ss_threshold = BATADV_TP_PLEN * 2; + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: RTO fired during test towards %pM! cwnd=%u new ss_thr=%u, resetting last_sent to %u\n", + tp_vars->other_end, tp_vars->cwnd, tp_vars->ss_threshold, + atomic_read(&tp_vars->last_acked)); + + tp_vars->cwnd = BATADV_TP_PLEN * 3; + + spin_unlock_bh(&tp_vars->cwnd_lock); + + /* resend the non-ACKed packets.. */ + tp_vars->last_sent = atomic_read(&tp_vars->last_acked); + wake_up(&tp_vars->more_bytes); + + batadv_tp_reset_sender_timer(tp_vars); +} + +/** + * batadv_tp_fill_prerandom - Fill buffer with prefetched random bytes + * @tp_vars: the private TP meter data for this session + * @buf: Buffer to fill with bytes + * @nbytes: amount of pseudorandom bytes + */ +static void batadv_tp_fill_prerandom(struct batadv_tp_vars *tp_vars, + u8 *buf, size_t nbytes) +{ + u32 local_offset; + size_t bytes_inbuf; + size_t to_copy; + size_t pos = 0; + + spin_lock_bh(&tp_vars->prerandom_lock); + local_offset = tp_vars->prerandom_offset; + tp_vars->prerandom_offset += nbytes; + tp_vars->prerandom_offset %= sizeof(batadv_tp_prerandom); + spin_unlock_bh(&tp_vars->prerandom_lock); + + while (nbytes) { + local_offset %= sizeof(batadv_tp_prerandom); + bytes_inbuf = sizeof(batadv_tp_prerandom) - local_offset; + to_copy = min(nbytes, bytes_inbuf); + + memcpy(&buf[pos], &batadv_tp_prerandom[local_offset], to_copy); + pos += to_copy; + nbytes -= to_copy; + local_offset = 0; + } +} + +/** + * batadv_tp_send_msg - send a single message + * @tp_vars: the private TP meter data for this session + * @src: source mac address + * @orig_node: the originator of the destination + * @seqno: sequence number of this packet + * @len: length of the entire packet + * @session: session identifier + * @uid: local ICMP "socket" index + * @timestamp: timestamp in jiffies which is replied in ack + * + * Create and send a single TP Meter message. + * + * Return: 0 on success, BATADV_TP_REASON_DST_UNREACHABLE if the destination is + * not reachable, BATADV_TP_REASON_MEMORY_ERROR if the packet couldn't be + * allocated + */ +static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src, + struct batadv_orig_node *orig_node, + u32 seqno, size_t len, const u8 *session, + int uid, u32 timestamp) +{ + struct batadv_icmp_tp_packet *icmp; + struct sk_buff *skb; + int r; + u8 *data; + size_t data_len; + + skb = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN); + if (unlikely(!skb)) + return BATADV_TP_REASON_MEMORY_ERROR; + + skb_reserve(skb, ETH_HLEN); + icmp = (struct batadv_icmp_tp_packet *)skb_put(skb, sizeof(*icmp)); + + /* fill the icmp header */ + ether_addr_copy(icmp->dst, orig_node->orig); + ether_addr_copy(icmp->orig, src); + icmp->version = BATADV_COMPAT_VERSION; + icmp->packet_type = BATADV_ICMP; + icmp->ttl = BATADV_TTL; + icmp->msg_type = BATADV_TP; + icmp->uid = uid; + + icmp->subtype = BATADV_TP_MSG; + memcpy(icmp->session, session, sizeof(icmp->session)); + icmp->seqno = htonl(seqno); + icmp->timestamp = htonl(timestamp); + + data_len = len - sizeof(*icmp); + data = (u8 *)skb_put(skb, data_len); + batadv_tp_fill_prerandom(tp_vars, data, data_len); + + r = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (r == -1) + kfree_skb(skb); + + if (r == NET_XMIT_SUCCESS) + return 0; + + return BATADV_TP_REASON_CANT_SEND; +} + +/** + * batadv_tp_recv_ack - ACK receiving function + * @bat_priv: the bat priv with all the soft interface information + * @skb: the buffer containing the received packet + * + * Process a received TP ACK packet + */ +static void batadv_tp_recv_ack(struct batadv_priv *bat_priv, + const struct sk_buff *skb) +{ + struct batadv_hard_iface *primary_if = NULL; + struct batadv_orig_node *orig_node = NULL; + const struct batadv_icmp_tp_packet *icmp; + struct batadv_tp_vars *tp_vars; + size_t packet_len, mss; + u32 rtt, recv_ack, cwnd; + unsigned char *dev_addr; + + packet_len = BATADV_TP_PLEN; + mss = BATADV_TP_PLEN; + packet_len += sizeof(struct batadv_unicast_packet); + + icmp = (struct batadv_icmp_tp_packet *)skb->data; + + /* find the tp_vars */ + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, + icmp->session); + if (unlikely(!tp_vars)) + return; + + if (unlikely(atomic_read(&tp_vars->sending) == 0)) + goto out; + + /* old ACK? silently drop it.. */ + if (batadv_seq_before(ntohl(icmp->seqno), + (u32)atomic_read(&tp_vars->last_acked))) + goto out; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (unlikely(!primary_if)) + goto out; + + orig_node = batadv_orig_hash_find(bat_priv, icmp->orig); + if (unlikely(!orig_node)) + goto out; + + /* update RTO with the new sampled RTT, if any */ + rtt = jiffies_to_msecs(jiffies) - ntohl(icmp->timestamp); + if (icmp->timestamp && rtt) + batadv_tp_update_rto(tp_vars, rtt); + + /* ACK for new data... reset the timer */ + batadv_tp_reset_sender_timer(tp_vars); + + recv_ack = ntohl(icmp->seqno); + + /* check if this ACK is a duplicate */ + if (atomic_read(&tp_vars->last_acked) == recv_ack) { + atomic_inc(&tp_vars->dup_acks); + if (atomic_read(&tp_vars->dup_acks) != 3) + goto out; + + if (recv_ack >= tp_vars->recover) + goto out; + + /* if this is the third duplicate ACK do Fast Retransmit */ + batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr, + orig_node, recv_ack, packet_len, + icmp->session, icmp->uid, + jiffies_to_msecs(jiffies)); + + spin_lock_bh(&tp_vars->cwnd_lock); + + /* Fast Recovery */ + tp_vars->fast_recovery = true; + /* Set recover to the last outstanding seqno when Fast Recovery + * is entered. RFC6582, Section 3.2, step 1 + */ + tp_vars->recover = tp_vars->last_sent; + tp_vars->ss_threshold = tp_vars->cwnd >> 1; + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: Fast Recovery, (cur cwnd=%u) ss_thr=%u last_sent=%u recv_ack=%u\n", + tp_vars->cwnd, tp_vars->ss_threshold, + tp_vars->last_sent, recv_ack); + tp_vars->cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 3 * mss, + mss); + tp_vars->dec_cwnd = 0; + tp_vars->last_sent = recv_ack; + + spin_unlock_bh(&tp_vars->cwnd_lock); + } else { + /* count the acked data */ + atomic64_add(recv_ack - atomic_read(&tp_vars->last_acked), + &tp_vars->tot_sent); + /* reset the duplicate ACKs counter */ + atomic_set(&tp_vars->dup_acks, 0); + + if (tp_vars->fast_recovery) { + /* partial ACK */ + if (batadv_seq_before(recv_ack, tp_vars->recover)) { + /* this is another hole in the window. React + * immediately as specified by NewReno (see + * Section 3.2 of RFC6582 for details) + */ + dev_addr = primary_if->net_dev->dev_addr; + batadv_tp_send_msg(tp_vars, dev_addr, + orig_node, recv_ack, + packet_len, icmp->session, + icmp->uid, + jiffies_to_msecs(jiffies)); + tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, + mss, mss); + } else { + tp_vars->fast_recovery = false; + /* set cwnd to the value of ss_threshold at the + * moment that Fast Recovery was entered. + * RFC6582, Section 3.2, step 3 + */ + cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 0, + mss); + tp_vars->cwnd = cwnd; + } + goto move_twnd; + } + + if (recv_ack - atomic_read(&tp_vars->last_acked) >= mss) + batadv_tp_update_cwnd(tp_vars, mss); +move_twnd: + /* move the Transmit Window */ + atomic_set(&tp_vars->last_acked, recv_ack); + } + + wake_up(&tp_vars->more_bytes); +out: + if (likely(primary_if)) + batadv_hardif_put(primary_if); + if (likely(orig_node)) + batadv_orig_node_put(orig_node); + if (likely(tp_vars)) + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_avail - check if congestion window is not full + * @tp_vars: the private data of the current TP meter session + * @payload_len: size of the payload of a single message + * + * Return: true when congestion window is not full, false otherwise + */ +static bool batadv_tp_avail(struct batadv_tp_vars *tp_vars, + size_t payload_len) +{ + u32 win_left, win_limit; + + win_limit = atomic_read(&tp_vars->last_acked) + tp_vars->cwnd; + win_left = win_limit - tp_vars->last_sent; + + return win_left >= payload_len; +} + +/** + * batadv_tp_wait_available - wait until congestion window becomes free or + * timeout is reached + * @tp_vars: the private data of the current TP meter session + * @plen: size of the payload of a single message + * + * Return: 0 if the condition evaluated to false after the timeout elapsed, + * 1 if the condition evaluated to true after the timeout elapsed, the + * remaining jiffies (at least 1) if the condition evaluated to true before + * the timeout elapsed, or -ERESTARTSYS if it was interrupted by a signal. + */ +static int batadv_tp_wait_available(struct batadv_tp_vars *tp_vars, size_t plen) +{ + int ret; + + ret = wait_event_interruptible_timeout(tp_vars->more_bytes, + batadv_tp_avail(tp_vars, plen), + HZ / 10); + + return ret; +} + +/** + * batadv_tp_send - main sending thread of a tp meter session + * @arg: address of the related tp_vars + * + * Return: nothing, this function never returns + */ +static int batadv_tp_send(void *arg) +{ + struct batadv_tp_vars *tp_vars = arg; + struct batadv_priv *bat_priv = tp_vars->bat_priv; + struct batadv_hard_iface *primary_if = NULL; + struct batadv_orig_node *orig_node = NULL; + size_t payload_len, packet_len; + int err = 0; + + if (unlikely(tp_vars->role != BATADV_TP_SENDER)) { + err = BATADV_TP_REASON_DST_UNREACHABLE; + tp_vars->reason = err; + goto out; + } + + orig_node = batadv_orig_hash_find(bat_priv, tp_vars->other_end); + if (unlikely(!orig_node)) { + err = BATADV_TP_REASON_DST_UNREACHABLE; + tp_vars->reason = err; + goto out; + } + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (unlikely(!primary_if)) { + err = BATADV_TP_REASON_DST_UNREACHABLE; + goto out; + } + + /* assume that all the hard_interfaces have a correctly + * configured MTU, so use the soft_iface MTU as MSS. + * This might not be true and in that case the fragmentation + * should be used. + * Now, try to send the packet as it is + */ + payload_len = BATADV_TP_PLEN; + BUILD_BUG_ON(sizeof(struct batadv_icmp_tp_packet) > BATADV_TP_PLEN); + + batadv_tp_reset_sender_timer(tp_vars); + + /* queue the worker in charge of terminating the test */ + queue_delayed_work(batadv_event_workqueue, &tp_vars->finish_work, + msecs_to_jiffies(tp_vars->test_length)); + + while (atomic_read(&tp_vars->sending) != 0) { + if (unlikely(!batadv_tp_avail(tp_vars, payload_len))) { + batadv_tp_wait_available(tp_vars, payload_len); + continue; + } + + /* to emulate normal unicast traffic, add to the payload len + * the size of the unicast header + */ + packet_len = payload_len + sizeof(struct batadv_unicast_packet); + + err = batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr, + orig_node, tp_vars->last_sent, + packet_len, + tp_vars->session, tp_vars->icmp_uid, + jiffies_to_msecs(jiffies)); + + /* something went wrong during the preparation/transmission */ + if (unlikely(err && err != BATADV_TP_REASON_CANT_SEND)) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: batadv_tp_send() cannot send packets (%d)\n", + err); + /* ensure nobody else tries to stop the thread now */ + if (atomic_dec_and_test(&tp_vars->sending)) + tp_vars->reason = err; + break; + } + + /* right-shift the TWND */ + if (!err) + tp_vars->last_sent += payload_len; + + cond_resched(); + } + +out: + if (likely(primary_if)) + batadv_hardif_put(primary_if); + if (likely(orig_node)) + batadv_orig_node_put(orig_node); + + batadv_tp_sender_end(bat_priv, tp_vars); + batadv_tp_sender_cleanup(bat_priv, tp_vars); + + batadv_tp_vars_put(tp_vars); + + do_exit(0); +} + +/** + * batadv_tp_start_kthread - start new thread which manages the tp meter sender + * @tp_vars: the private data of the current TP meter session + */ +static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) +{ + struct task_struct *kthread; + struct batadv_priv *bat_priv = tp_vars->bat_priv; + u32 session_cookie; + + kref_get(&tp_vars->refcount); + kthread = kthread_create(batadv_tp_send, tp_vars, "kbatadv_tp_meter"); + if (IS_ERR(kthread)) { + session_cookie = batadv_tp_session_cookie(tp_vars->session, + tp_vars->icmp_uid); + pr_err("batadv: cannot create tp meter kthread\n"); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR, + tp_vars->other_end, + bat_priv, session_cookie); + + /* drop reserved reference for kthread */ + batadv_tp_vars_put(tp_vars); + + /* cleanup of failed tp meter variables */ + batadv_tp_sender_cleanup(bat_priv, tp_vars); + return; + } + + wake_up_process(kthread); +} + +/** + * batadv_tp_start - start a new tp meter session + * @bat_priv: the bat priv with all the soft interface information + * @dst: the receiver MAC address + * @test_length: test length in milliseconds + * @cookie: session cookie + */ +void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, + u32 test_length, u32 *cookie) +{ + struct batadv_tp_vars *tp_vars; + u8 session_id[2]; + u8 icmp_uid; + u32 session_cookie; + + get_random_bytes(session_id, sizeof(session_id)); + get_random_bytes(&icmp_uid, 1); + session_cookie = batadv_tp_session_cookie(session_id, icmp_uid); + *cookie = session_cookie; + + /* look for an already existing test towards this node */ + spin_lock_bh(&bat_priv->tp_list_lock); + tp_vars = batadv_tp_list_find(bat_priv, dst); + if (tp_vars) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_tp_vars_put(tp_vars); + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: test to or from the same node already ongoing, aborting\n"); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_ALREADY_ONGOING, + dst, bat_priv, session_cookie); + return; + } + + if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: too many ongoing sessions, aborting (SEND)\n"); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_TOO_MANY, dst, + bat_priv, session_cookie); + return; + } + + tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC); + if (!tp_vars) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: batadv_tp_start cannot allocate list elements\n"); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR, + dst, bat_priv, session_cookie); + return; + } + + /* initialize tp_vars */ + ether_addr_copy(tp_vars->other_end, dst); + kref_init(&tp_vars->refcount); + tp_vars->role = BATADV_TP_SENDER; + atomic_set(&tp_vars->sending, 1); + memcpy(tp_vars->session, session_id, sizeof(session_id)); + tp_vars->icmp_uid = icmp_uid; + + tp_vars->last_sent = BATADV_TP_FIRST_SEQ; + atomic_set(&tp_vars->last_acked, BATADV_TP_FIRST_SEQ); + tp_vars->fast_recovery = false; + tp_vars->recover = BATADV_TP_FIRST_SEQ; + + /* initialise the CWND to 3*MSS (Section 3.1 in RFC5681). + * For batman-adv the MSS is the size of the payload received by the + * soft_interface, hence its MTU + */ + tp_vars->cwnd = BATADV_TP_PLEN * 3; + /* at the beginning initialise the SS threshold to the biggest possible + * window size, hence the AWND size + */ + tp_vars->ss_threshold = BATADV_TP_AWND; + + /* RTO initial value is 3 seconds. + * Details in Section 2.1 of RFC6298 + */ + tp_vars->rto = 1000; + tp_vars->srtt = 0; + tp_vars->rttvar = 0; + + atomic64_set(&tp_vars->tot_sent, 0); + + kref_get(&tp_vars->refcount); + setup_timer(&tp_vars->timer, batadv_tp_sender_timeout, + (unsigned long)tp_vars); + + tp_vars->bat_priv = bat_priv; + tp_vars->start_time = jiffies; + + init_waitqueue_head(&tp_vars->more_bytes); + + spin_lock_init(&tp_vars->unacked_lock); + INIT_LIST_HEAD(&tp_vars->unacked_list); + + spin_lock_init(&tp_vars->cwnd_lock); + + tp_vars->prerandom_offset = 0; + spin_lock_init(&tp_vars->prerandom_lock); + + kref_get(&tp_vars->refcount); + hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list); + spin_unlock_bh(&bat_priv->tp_list_lock); + + tp_vars->test_length = test_length; + if (!tp_vars->test_length) + tp_vars->test_length = BATADV_TP_DEF_TEST_LENGTH; + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: starting throughput meter towards %pM (length=%ums)\n", + dst, test_length); + + /* init work item for finished tp tests */ + INIT_DELAYED_WORK(&tp_vars->finish_work, batadv_tp_sender_finish); + + /* start tp kthread. This way the write() call issued from userspace can + * happily return and avoid to block + */ + batadv_tp_start_kthread(tp_vars); + + /* don't return reference to new tp_vars */ + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_stop - stop currently running tp meter session + * @bat_priv: the bat priv with all the soft interface information + * @dst: the receiver MAC address + * @return_value: reason for tp meter session stop + */ +void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, + u8 return_value) +{ + struct batadv_orig_node *orig_node; + struct batadv_tp_vars *tp_vars; + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: stopping test towards %pM\n", dst); + + orig_node = batadv_orig_hash_find(bat_priv, dst); + if (!orig_node) + return; + + tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig); + if (!tp_vars) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: trying to interrupt an already over connection\n"); + goto out; + } + + batadv_tp_sender_shutdown(tp_vars, return_value); + batadv_tp_vars_put(tp_vars); +out: + batadv_orig_node_put(orig_node); +} + +/** + * batadv_tp_reset_receiver_timer - reset the receiver shutdown timer + * @tp_vars: the private data of the current TP meter session + * + * start the receiver shutdown timer or reset it if already started + */ +static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars) +{ + mod_timer(&tp_vars->timer, + jiffies + msecs_to_jiffies(BATADV_TP_RECV_TIMEOUT)); +} + +/** + * batadv_tp_receiver_shutdown - stop a tp meter receiver when timeout is + * reached without received ack + * @arg: address of the related tp_vars + */ +static void batadv_tp_receiver_shutdown(unsigned long arg) +{ + struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg; + struct batadv_tp_unacked *un, *safe; + struct batadv_priv *bat_priv; + + bat_priv = tp_vars->bat_priv; + + /* if there is recent activity rearm the timer */ + if (!batadv_has_timed_out(tp_vars->last_recv_time, + BATADV_TP_RECV_TIMEOUT)) { + /* reset the receiver shutdown timer */ + batadv_tp_reset_receiver_timer(tp_vars); + return; + } + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Shutting down for inactivity (more than %dms) from %pM\n", + BATADV_TP_RECV_TIMEOUT, tp_vars->other_end); + + spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); + hlist_del_rcu(&tp_vars->list); + spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); + + /* drop list reference */ + batadv_tp_vars_put(tp_vars); + + atomic_dec(&bat_priv->tp_num); + + spin_lock_bh(&tp_vars->unacked_lock); + list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { + list_del(&un->list); + kfree(un); + } + spin_unlock_bh(&tp_vars->unacked_lock); + + /* drop reference of timer */ + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_send_ack - send an ACK packet + * @bat_priv: the bat priv with all the soft interface information + * @dst: the mac address of the destination originator + * @seq: the sequence number to ACK + * @timestamp: the timestamp to echo back in the ACK + * @session: session identifier + * @socket_index: local ICMP socket identifier + * + * Return: 0 on success, a positive integer representing the reason of the + * failure otherwise + */ +static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst, + u32 seq, __be32 timestamp, const u8 *session, + int socket_index) +{ + struct batadv_hard_iface *primary_if = NULL; + struct batadv_orig_node *orig_node; + struct batadv_icmp_tp_packet *icmp; + struct sk_buff *skb; + int r, ret; + + orig_node = batadv_orig_hash_find(bat_priv, dst); + if (unlikely(!orig_node)) { + ret = BATADV_TP_REASON_DST_UNREACHABLE; + goto out; + } + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (unlikely(!primary_if)) { + ret = BATADV_TP_REASON_DST_UNREACHABLE; + goto out; + } + + skb = netdev_alloc_skb_ip_align(NULL, sizeof(*icmp) + ETH_HLEN); + if (unlikely(!skb)) { + ret = BATADV_TP_REASON_MEMORY_ERROR; + goto out; + } + + skb_reserve(skb, ETH_HLEN); + icmp = (struct batadv_icmp_tp_packet *)skb_put(skb, sizeof(*icmp)); + icmp->packet_type = BATADV_ICMP; + icmp->version = BATADV_COMPAT_VERSION; + icmp->ttl = BATADV_TTL; + icmp->msg_type = BATADV_TP; + ether_addr_copy(icmp->dst, orig_node->orig); + ether_addr_copy(icmp->orig, primary_if->net_dev->dev_addr); + icmp->uid = socket_index; + + icmp->subtype = BATADV_TP_ACK; + memcpy(icmp->session, session, sizeof(icmp->session)); + icmp->seqno = htonl(seq); + icmp->timestamp = timestamp; + + /* send the ack */ + r = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (r == -1) + kfree_skb(skb); + + if (unlikely(r < 0) || (r == NET_XMIT_DROP)) { + ret = BATADV_TP_REASON_DST_UNREACHABLE; + goto out; + } + ret = 0; + +out: + if (likely(orig_node)) + batadv_orig_node_put(orig_node); + if (likely(primary_if)) + batadv_hardif_put(primary_if); + + return ret; +} + +/** + * batadv_tp_handle_out_of_order - store an out of order packet + * @tp_vars: the private data of the current TP meter session + * @skb: the buffer containing the received packet + * + * Store the out of order packet in the unacked list for late processing. This + * packets are kept in this list so that they can be ACKed at once as soon as + * all the previous packets have been received + * + * Return: true if the packed has been successfully processed, false otherwise + */ +static bool batadv_tp_handle_out_of_order(struct batadv_tp_vars *tp_vars, + const struct sk_buff *skb) +{ + const struct batadv_icmp_tp_packet *icmp; + struct batadv_tp_unacked *un, *new; + u32 payload_len; + bool added = false; + + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (unlikely(!new)) + return false; + + icmp = (struct batadv_icmp_tp_packet *)skb->data; + + new->seqno = ntohl(icmp->seqno); + payload_len = skb->len - sizeof(struct batadv_unicast_packet); + new->len = payload_len; + + spin_lock_bh(&tp_vars->unacked_lock); + /* if the list is empty immediately attach this new object */ + if (list_empty(&tp_vars->unacked_list)) { + list_add(&new->list, &tp_vars->unacked_list); + goto out; + } + + /* otherwise loop over the list and either drop the packet because this + * is a duplicate or store it at the right position. + * + * The iteration is done in the reverse way because it is likely that + * the last received packet (the one being processed now) has a bigger + * seqno than all the others already stored. + */ + list_for_each_entry_reverse(un, &tp_vars->unacked_list, list) { + /* check for duplicates */ + if (new->seqno == un->seqno) { + if (new->len > un->len) + un->len = new->len; + kfree(new); + added = true; + break; + } + + /* look for the right position */ + if (batadv_seq_before(new->seqno, un->seqno)) + continue; + + /* as soon as an entry having a bigger seqno is found, the new + * one is attached _after_ it. In this way the list is kept in + * ascending order + */ + list_add_tail(&new->list, &un->list); + added = true; + break; + } + + /* received packet with smallest seqno out of order; add it to front */ + if (!added) + list_add(&new->list, &tp_vars->unacked_list); + +out: + spin_unlock_bh(&tp_vars->unacked_lock); + + return true; +} + +/** + * batadv_tp_ack_unordered - update number received bytes in current stream + * without gaps + * @tp_vars: the private data of the current TP meter session + */ +static void batadv_tp_ack_unordered(struct batadv_tp_vars *tp_vars) +{ + struct batadv_tp_unacked *un, *safe; + u32 to_ack; + + /* go through the unacked packet list and possibly ACK them as + * well + */ + spin_lock_bh(&tp_vars->unacked_lock); + list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { + /* the list is ordered, therefore it is possible to stop as soon + * there is a gap between the last acked seqno and the seqno of + * the packet under inspection + */ + if (batadv_seq_before(tp_vars->last_recv, un->seqno)) + break; + + to_ack = un->seqno + un->len - tp_vars->last_recv; + + if (batadv_seq_before(tp_vars->last_recv, un->seqno + un->len)) + tp_vars->last_recv += to_ack; + + list_del(&un->list); + kfree(un); + } + spin_unlock_bh(&tp_vars->unacked_lock); +} + +/** + * batadv_tp_init_recv - return matching or create new receiver tp_vars + * @bat_priv: the bat priv with all the soft interface information + * @icmp: received icmp tp msg + * + * Return: corresponding tp_vars or NULL on errors + */ +static struct batadv_tp_vars * +batadv_tp_init_recv(struct batadv_priv *bat_priv, + const struct batadv_icmp_tp_packet *icmp) +{ + struct batadv_tp_vars *tp_vars; + + spin_lock_bh(&bat_priv->tp_list_lock); + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, + icmp->session); + if (tp_vars) + goto out_unlock; + + if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: too many ongoing sessions, aborting (RECV)\n"); + goto out_unlock; + } + + tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC); + if (!tp_vars) + goto out_unlock; + + ether_addr_copy(tp_vars->other_end, icmp->orig); + tp_vars->role = BATADV_TP_RECEIVER; + memcpy(tp_vars->session, icmp->session, sizeof(tp_vars->session)); + tp_vars->last_recv = BATADV_TP_FIRST_SEQ; + tp_vars->bat_priv = bat_priv; + kref_init(&tp_vars->refcount); + + spin_lock_init(&tp_vars->unacked_lock); + INIT_LIST_HEAD(&tp_vars->unacked_list); + + kref_get(&tp_vars->refcount); + hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list); + + kref_get(&tp_vars->refcount); + setup_timer(&tp_vars->timer, batadv_tp_receiver_shutdown, + (unsigned long)tp_vars); + + batadv_tp_reset_receiver_timer(tp_vars); + +out_unlock: + spin_unlock_bh(&bat_priv->tp_list_lock); + + return tp_vars; +} + +/** + * batadv_tp_recv_msg - process a single data message + * @bat_priv: the bat priv with all the soft interface information + * @skb: the buffer containing the received packet + * + * Process a received TP MSG packet + */ +static void batadv_tp_recv_msg(struct batadv_priv *bat_priv, + const struct sk_buff *skb) +{ + const struct batadv_icmp_tp_packet *icmp; + struct batadv_tp_vars *tp_vars; + size_t packet_size; + u32 seqno; + + icmp = (struct batadv_icmp_tp_packet *)skb->data; + + seqno = ntohl(icmp->seqno); + /* check if this is the first seqno. This means that if the + * first packet is lost, the tp meter does not work anymore! + */ + if (seqno == BATADV_TP_FIRST_SEQ) { + tp_vars = batadv_tp_init_recv(bat_priv, icmp); + if (!tp_vars) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: seqno != BATADV_TP_FIRST_SEQ cannot initiate connection\n"); + goto out; + } + } else { + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, + icmp->session); + if (!tp_vars) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Unexpected packet from %pM!\n", + icmp->orig); + goto out; + } + } + + if (unlikely(tp_vars->role != BATADV_TP_RECEIVER)) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: dropping packet: not expected (role=%u)\n", + tp_vars->role); + goto out; + } + + tp_vars->last_recv_time = jiffies; + + /* if the packet is a duplicate, it may be the case that an ACK has been + * lost. Resend the ACK + */ + if (batadv_seq_before(seqno, tp_vars->last_recv)) + goto send_ack; + + /* if the packet is out of order enqueue it */ + if (ntohl(icmp->seqno) != tp_vars->last_recv) { + /* exit immediately (and do not send any ACK) if the packet has + * not been enqueued correctly + */ + if (!batadv_tp_handle_out_of_order(tp_vars, skb)) + goto out; + + /* send a duplicate ACK */ + goto send_ack; + } + + /* if everything was fine count the ACKed bytes */ + packet_size = skb->len - sizeof(struct batadv_unicast_packet); + tp_vars->last_recv += packet_size; + + /* check if this ordered message filled a gap.... */ + batadv_tp_ack_unordered(tp_vars); + +send_ack: + /* send the ACK. If the received packet was out of order, the ACK that + * is going to be sent is a duplicate (the sender will count them and + * possibly enter Fast Retransmit as soon as it has reached 3) + */ + batadv_tp_send_ack(bat_priv, icmp->orig, tp_vars->last_recv, + icmp->timestamp, icmp->session, icmp->uid); +out: + if (likely(tp_vars)) + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_meter_recv - main TP Meter receiving function + * @bat_priv: the bat priv with all the soft interface information + * @skb: the buffer containing the received packet + */ +void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) +{ + struct batadv_icmp_tp_packet *icmp; + + icmp = (struct batadv_icmp_tp_packet *)skb->data; + + switch (icmp->subtype) { + case BATADV_TP_MSG: + batadv_tp_recv_msg(bat_priv, skb); + break; + case BATADV_TP_ACK: + batadv_tp_recv_ack(bat_priv, skb); + break; + default: + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Received unknown TP Metric packet type %u\n", + icmp->subtype); + } + consume_skb(skb); +} + +/** + * batadv_tp_meter_init - initialize global tp_meter structures + */ +void batadv_tp_meter_init(void) +{ + get_random_bytes(batadv_tp_prerandom, sizeof(batadv_tp_prerandom)); +} diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h new file mode 100644 index 000000000..ba922c425 --- /dev/null +++ b/net/batman-adv/tp_meter.h @@ -0,0 +1,34 @@ +/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: + * + * Edo Monticelli, Antonio Quartulli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_TP_METER_H_ +#define _NET_BATMAN_ADV_TP_METER_H_ + +#include "main.h" + +#include + +struct sk_buff; + +void batadv_tp_meter_init(void); +void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, + u32 test_length, u32 *cookie); +void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, + u8 return_value); +void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb); + +#endif /* _NET_BATMAN_ADV_TP_METER_H_ */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 57ec87f37..7e6df7a49 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -47,10 +47,12 @@ #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "multicast.h" #include "originator.h" #include "packet.h" #include "soft-interface.h" +#include "tvlv.h" /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; @@ -996,7 +998,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) struct batadv_tt_local_entry *tt_local; struct batadv_hard_iface *primary_if; struct hlist_head *head; - unsigned short vid; u32 i; int last_seen_secs; int last_seen_msecs; @@ -1023,7 +1024,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) tt_local = container_of(tt_common_entry, struct batadv_tt_local_entry, common); - vid = tt_common_entry->vid; last_seen_jiffies = jiffies - tt_local->last_seen; last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); last_seen_secs = last_seen_msecs / 1000; @@ -1547,7 +1547,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry) { struct batadv_neigh_node *router, *best_router = NULL; - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry, *best_entry = NULL; @@ -1559,8 +1559,8 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, continue; if (best_router && - bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT, - best_router, BATADV_IF_DEFAULT) <= 0) { + bao->neigh.cmp(router, BATADV_IF_DEFAULT, best_router, + BATADV_IF_DEFAULT) <= 0) { batadv_neigh_node_put(router); continue; } diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c new file mode 100644 index 000000000..3d1cf0fb1 --- /dev/null +++ b/net/batman-adv/tvlv.c @@ -0,0 +1,632 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "originator.h" +#include "packet.h" +#include "send.h" +#include "tvlv.h" + +/** + * batadv_tvlv_handler_release - release tvlv handler from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_handler_release(struct kref *ref) +{ + struct batadv_tvlv_handler *tvlv_handler; + + tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); + kfree_rcu(tvlv_handler, rcu); +} + +/** + * batadv_tvlv_handler_put - decrement the tvlv container refcounter and + * possibly release it + * @tvlv_handler: the tvlv handler to free + */ +static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) +{ + kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); +} + +/** + * batadv_tvlv_handler_get - retrieve tvlv handler from the tvlv handler list + * based on the provided type and version (both need to match) + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv handler type to look for + * @version: tvlv handler version to look for + * + * Return: tvlv handler if found or NULL otherwise. + */ +static struct batadv_tvlv_handler * +batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) +{ + struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tvlv_handler_tmp, + &bat_priv->tvlv.handler_list, list) { + if (tvlv_handler_tmp->type != type) + continue; + + if (tvlv_handler_tmp->version != version) + continue; + + if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) + continue; + + tvlv_handler = tvlv_handler_tmp; + break; + } + rcu_read_unlock(); + + return tvlv_handler; +} + +/** + * batadv_tvlv_container_release - release tvlv from lists and free + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_container_release(struct kref *ref) +{ + struct batadv_tvlv_container *tvlv; + + tvlv = container_of(ref, struct batadv_tvlv_container, refcount); + kfree(tvlv); +} + +/** + * batadv_tvlv_container_put - decrement the tvlv container refcounter and + * possibly release it + * @tvlv: the tvlv container to free + */ +static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) +{ + kref_put(&tvlv->refcount, batadv_tvlv_container_release); +} + +/** + * batadv_tvlv_container_get - retrieve tvlv container from the tvlv container + * list based on the provided type and version (both need to match) + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv container type to look for + * @version: tvlv container version to look for + * + * Has to be called with the appropriate locks being acquired + * (tvlv.container_list_lock). + * + * Return: tvlv container if found or NULL otherwise. + */ +static struct batadv_tvlv_container * +batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) +{ + struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; + + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + + hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { + if (tvlv_tmp->tvlv_hdr.type != type) + continue; + + if (tvlv_tmp->tvlv_hdr.version != version) + continue; + + kref_get(&tvlv_tmp->refcount); + tvlv = tvlv_tmp; + break; + } + + return tvlv; +} + +/** + * batadv_tvlv_container_list_size - calculate the size of the tvlv container + * list entries + * @bat_priv: the bat priv with all the soft interface information + * + * Has to be called with the appropriate locks being acquired + * (tvlv.container_list_lock). + * + * Return: size of all currently registered tvlv containers in bytes. + */ +static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) +{ + struct batadv_tvlv_container *tvlv; + u16 tvlv_len = 0; + + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + + hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { + tvlv_len += sizeof(struct batadv_tvlv_hdr); + tvlv_len += ntohs(tvlv->tvlv_hdr.len); + } + + return tvlv_len; +} + +/** + * batadv_tvlv_container_remove - remove tvlv container from the tvlv container + * list + * @bat_priv: the bat priv with all the soft interface information + * @tvlv: the to be removed tvlv container + * + * Has to be called with the appropriate locks being acquired + * (tvlv.container_list_lock). + */ +static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, + struct batadv_tvlv_container *tvlv) +{ + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + + if (!tvlv) + return; + + hlist_del(&tvlv->list); + + /* first call to decrement the counter, second call to free */ + batadv_tvlv_container_put(tvlv); + batadv_tvlv_container_put(tvlv); +} + +/** + * batadv_tvlv_container_unregister - unregister tvlv container based on the + * provided type and version (both need to match) + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv container type to unregister + * @version: tvlv container type to unregister + */ +void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, + u8 type, u8 version) +{ + struct batadv_tvlv_container *tvlv; + + spin_lock_bh(&bat_priv->tvlv.container_list_lock); + tvlv = batadv_tvlv_container_get(bat_priv, type, version); + batadv_tvlv_container_remove(bat_priv, tvlv); + spin_unlock_bh(&bat_priv->tvlv.container_list_lock); +} + +/** + * batadv_tvlv_container_register - register tvlv type, version and content + * to be propagated with each (primary interface) OGM + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv container type + * @version: tvlv container version + * @tvlv_value: tvlv container content + * @tvlv_value_len: tvlv container content length + * + * If a container of the same type and version was already registered the new + * content is going to replace the old one. + */ +void batadv_tvlv_container_register(struct batadv_priv *bat_priv, + u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len) +{ + struct batadv_tvlv_container *tvlv_old, *tvlv_new; + + if (!tvlv_value) + tvlv_value_len = 0; + + tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC); + if (!tvlv_new) + return; + + tvlv_new->tvlv_hdr.version = version; + tvlv_new->tvlv_hdr.type = type; + tvlv_new->tvlv_hdr.len = htons(tvlv_value_len); + + memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); + INIT_HLIST_NODE(&tvlv_new->list); + kref_init(&tvlv_new->refcount); + + spin_lock_bh(&bat_priv->tvlv.container_list_lock); + tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); + batadv_tvlv_container_remove(bat_priv, tvlv_old); + hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); + spin_unlock_bh(&bat_priv->tvlv.container_list_lock); +} + +/** + * batadv_tvlv_realloc_packet_buff - reallocate packet buffer to accommodate + * requested packet size + * @packet_buff: packet buffer + * @packet_buff_len: packet buffer size + * @min_packet_len: requested packet minimum size + * @additional_packet_len: requested additional packet size on top of minimum + * size + * + * Return: true of the packet buffer could be changed to the requested size, + * false otherwise. + */ +static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, + int *packet_buff_len, + int min_packet_len, + int additional_packet_len) +{ + unsigned char *new_buff; + + new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); + + /* keep old buffer if kmalloc should fail */ + if (!new_buff) + return false; + + memcpy(new_buff, *packet_buff, min_packet_len); + kfree(*packet_buff); + *packet_buff = new_buff; + *packet_buff_len = min_packet_len + additional_packet_len; + + return true; +} + +/** + * batadv_tvlv_container_ogm_append - append tvlv container content to given + * OGM packet buffer + * @bat_priv: the bat priv with all the soft interface information + * @packet_buff: ogm packet buffer + * @packet_buff_len: ogm packet buffer size including ogm header and tvlv + * content + * @packet_min_len: ogm header size to be preserved for the OGM itself + * + * The ogm packet might be enlarged or shrunk depending on the current size + * and the size of the to-be-appended tvlv containers. + * + * Return: size of all appended tvlv containers in bytes. + */ +u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, + unsigned char **packet_buff, + int *packet_buff_len, int packet_min_len) +{ + struct batadv_tvlv_container *tvlv; + struct batadv_tvlv_hdr *tvlv_hdr; + u16 tvlv_value_len; + void *tvlv_value; + bool ret; + + spin_lock_bh(&bat_priv->tvlv.container_list_lock); + tvlv_value_len = batadv_tvlv_container_list_size(bat_priv); + + ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len, + packet_min_len, tvlv_value_len); + + if (!ret) + goto end; + + if (!tvlv_value_len) + goto end; + + tvlv_value = (*packet_buff) + packet_min_len; + + hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { + tvlv_hdr = tvlv_value; + tvlv_hdr->type = tvlv->tvlv_hdr.type; + tvlv_hdr->version = tvlv->tvlv_hdr.version; + tvlv_hdr->len = tvlv->tvlv_hdr.len; + tvlv_value = tvlv_hdr + 1; + memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len)); + tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); + } + +end: + spin_unlock_bh(&bat_priv->tvlv.container_list_lock); + return tvlv_value_len; +} + +/** + * batadv_tvlv_call_handler - parse the given tvlv buffer to call the + * appropriate handlers + * @bat_priv: the bat priv with all the soft interface information + * @tvlv_handler: tvlv callback function handling the tvlv content + * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet + * @orig_node: orig node emitting the ogm packet + * @src: source mac address of the unicast packet + * @dst: destination mac address of the unicast packet + * @tvlv_value: tvlv content + * @tvlv_value_len: tvlv content length + * + * Return: success if handler was not found or the return value of the handler + * callback. + */ +static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, + struct batadv_tvlv_handler *tvlv_handler, + bool ogm_source, + struct batadv_orig_node *orig_node, + u8 *src, u8 *dst, + void *tvlv_value, u16 tvlv_value_len) +{ + if (!tvlv_handler) + return NET_RX_SUCCESS; + + if (ogm_source) { + if (!tvlv_handler->ogm_handler) + return NET_RX_SUCCESS; + + if (!orig_node) + return NET_RX_SUCCESS; + + tvlv_handler->ogm_handler(bat_priv, orig_node, + BATADV_NO_FLAGS, + tvlv_value, tvlv_value_len); + tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED; + } else { + if (!src) + return NET_RX_SUCCESS; + + if (!dst) + return NET_RX_SUCCESS; + + if (!tvlv_handler->unicast_handler) + return NET_RX_SUCCESS; + + return tvlv_handler->unicast_handler(bat_priv, src, + dst, tvlv_value, + tvlv_value_len); + } + + return NET_RX_SUCCESS; +} + +/** + * batadv_tvlv_containers_process - parse the given tvlv buffer to call the + * appropriate handlers + * @bat_priv: the bat priv with all the soft interface information + * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet + * @orig_node: orig node emitting the ogm packet + * @src: source mac address of the unicast packet + * @dst: destination mac address of the unicast packet + * @tvlv_value: tvlv content + * @tvlv_value_len: tvlv content length + * + * Return: success when processing an OGM or the return value of all called + * handler callbacks. + */ +int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, + bool ogm_source, + struct batadv_orig_node *orig_node, + u8 *src, u8 *dst, + void *tvlv_value, u16 tvlv_value_len) +{ + struct batadv_tvlv_handler *tvlv_handler; + struct batadv_tvlv_hdr *tvlv_hdr; + u16 tvlv_value_cont_len; + u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; + int ret = NET_RX_SUCCESS; + + while (tvlv_value_len >= sizeof(*tvlv_hdr)) { + tvlv_hdr = tvlv_value; + tvlv_value_cont_len = ntohs(tvlv_hdr->len); + tvlv_value = tvlv_hdr + 1; + tvlv_value_len -= sizeof(*tvlv_hdr); + + if (tvlv_value_cont_len > tvlv_value_len) + break; + + tvlv_handler = batadv_tvlv_handler_get(bat_priv, + tvlv_hdr->type, + tvlv_hdr->version); + + ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler, + ogm_source, orig_node, + src, dst, tvlv_value, + tvlv_value_cont_len); + if (tvlv_handler) + batadv_tvlv_handler_put(tvlv_handler); + tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; + tvlv_value_len -= tvlv_value_cont_len; + } + + if (!ogm_source) + return ret; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tvlv_handler, + &bat_priv->tvlv.handler_list, list) { + if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) && + !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED)) + tvlv_handler->ogm_handler(bat_priv, orig_node, + cifnotfound, NULL, 0); + + tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED; + } + rcu_read_unlock(); + + return NET_RX_SUCCESS; +} + +/** + * batadv_tvlv_ogm_receive - process an incoming ogm and call the appropriate + * handlers + * @bat_priv: the bat priv with all the soft interface information + * @batadv_ogm_packet: ogm packet containing the tvlv containers + * @orig_node: orig node emitting the ogm packet + */ +void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, + struct batadv_ogm_packet *batadv_ogm_packet, + struct batadv_orig_node *orig_node) +{ + void *tvlv_value; + u16 tvlv_value_len; + + if (!batadv_ogm_packet) + return; + + tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len); + if (!tvlv_value_len) + return; + + tvlv_value = batadv_ogm_packet + 1; + + batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL, + tvlv_value, tvlv_value_len); +} + +/** + * batadv_tvlv_handler_register - register tvlv handler based on the provided + * type and version (both need to match) for ogm tvlv payload and/or unicast + * payload + * @bat_priv: the bat priv with all the soft interface information + * @optr: ogm tvlv handler callback function. This function receives the orig + * node, flags and the tvlv content as argument to process. + * @uptr: unicast tvlv handler callback function. This function receives the + * source & destination of the unicast packet as well as the tvlv content + * to process. + * @type: tvlv handler type to be registered + * @version: tvlv handler version to be registered + * @flags: flags to enable or disable TVLV API behavior + */ +void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, + void (*optr)(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 flags, + void *tvlv_value, + u16 tvlv_value_len), + int (*uptr)(struct batadv_priv *bat_priv, + u8 *src, u8 *dst, + void *tvlv_value, + u16 tvlv_value_len), + u8 type, u8 version, u8 flags) +{ + struct batadv_tvlv_handler *tvlv_handler; + + tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); + if (tvlv_handler) { + batadv_tvlv_handler_put(tvlv_handler); + return; + } + + tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); + if (!tvlv_handler) + return; + + tvlv_handler->ogm_handler = optr; + tvlv_handler->unicast_handler = uptr; + tvlv_handler->type = type; + tvlv_handler->version = version; + tvlv_handler->flags = flags; + kref_init(&tvlv_handler->refcount); + INIT_HLIST_NODE(&tvlv_handler->list); + + spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); +} + +/** + * batadv_tvlv_handler_unregister - unregister tvlv handler based on the + * provided type and version (both need to match) + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv handler type to be unregistered + * @version: tvlv handler version to be unregistered + */ +void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, + u8 type, u8 version) +{ + struct batadv_tvlv_handler *tvlv_handler; + + tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); + if (!tvlv_handler) + return; + + batadv_tvlv_handler_put(tvlv_handler); + spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + hlist_del_rcu(&tvlv_handler->list); + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); + batadv_tvlv_handler_put(tvlv_handler); +} + +/** + * batadv_tvlv_unicast_send - send a unicast packet with tvlv payload to the + * specified host + * @bat_priv: the bat priv with all the soft interface information + * @src: source mac address of the unicast packet + * @dst: destination mac address of the unicast packet + * @type: tvlv type + * @version: tvlv version + * @tvlv_value: tvlv content + * @tvlv_value_len: tvlv content length + */ +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, + u8 *dst, u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len) +{ + struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; + struct batadv_tvlv_hdr *tvlv_hdr; + struct batadv_orig_node *orig_node; + struct sk_buff *skb; + unsigned char *tvlv_buff; + unsigned int tvlv_len; + ssize_t hdr_len = sizeof(*unicast_tvlv_packet); + int res; + + orig_node = batadv_orig_hash_find(bat_priv, dst); + if (!orig_node) + return; + + tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len; + + skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len); + if (!skb) + goto out; + + skb->priority = TC_PRIO_CONTROL; + skb_reserve(skb, ETH_HLEN); + tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len); + unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff; + unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV; + unicast_tvlv_packet->version = BATADV_COMPAT_VERSION; + unicast_tvlv_packet->ttl = BATADV_TTL; + unicast_tvlv_packet->reserved = 0; + unicast_tvlv_packet->tvlv_len = htons(tvlv_len); + unicast_tvlv_packet->align = 0; + ether_addr_copy(unicast_tvlv_packet->src, src); + ether_addr_copy(unicast_tvlv_packet->dst, dst); + + tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1); + tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff; + tvlv_hdr->version = version; + tvlv_hdr->type = type; + tvlv_hdr->len = htons(tvlv_value_len); + tvlv_buff += sizeof(*tvlv_hdr); + memcpy(tvlv_buff, tvlv_value, tvlv_value_len); + + res = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (res == -1) + kfree_skb(skb); +out: + batadv_orig_node_put(orig_node); +} diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h new file mode 100644 index 000000000..e4369b547 --- /dev/null +++ b/net/batman-adv/tvlv.h @@ -0,0 +1,61 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_TVLV_H_ +#define _NET_BATMAN_ADV_TVLV_H_ + +#include "main.h" + +#include + +struct batadv_ogm_packet; + +void batadv_tvlv_container_register(struct batadv_priv *bat_priv, + u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len); +u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, + unsigned char **packet_buff, + int *packet_buff_len, int packet_min_len); +void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, + struct batadv_ogm_packet *batadv_ogm_packet, + struct batadv_orig_node *orig_node); +void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, + u8 type, u8 version); + +void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, + void (*optr)(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 flags, + void *tvlv_value, + u16 tvlv_value_len), + int (*uptr)(struct batadv_priv *bat_priv, + u8 *src, u8 *dst, + void *tvlv_value, + u16 tvlv_value_len), + u8 type, u8 version, u8 flags); +void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, + u8 type, u8 version); +int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, + bool ogm_source, + struct batadv_orig_node *orig_node, + u8 *src, u8 *dst, + void *tvlv_buff, u16 tvlv_buff_len); +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, + u8 *dst, u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len); + +#endif /* _NET_BATMAN_ADV_TVLV_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 74d865a4d..a64522c3b 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -33,6 +33,7 @@ #include #include #include +#include #include "packet.h" @@ -709,6 +710,8 @@ struct batadv_priv_debug_log { * @list: list of available gateway nodes * @list_lock: lock protecting gw_list & curr_gw * @curr_gw: pointer to currently selected gateway node + * @mode: gateway operation: off, client or server (see batadv_gw_modes) + * @sel_class: gateway selection class (applies if gw_mode client) * @bandwidth_down: advertised uplink download bandwidth (if gw_mode server) * @bandwidth_up: advertised uplink upload bandwidth (if gw_mode server) * @reselect: bool indicating a gateway re-selection is in progress @@ -717,6 +720,8 @@ struct batadv_priv_gw { struct hlist_head list; spinlock_t list_lock; /* protects gw_list & curr_gw */ struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */ + atomic_t mode; + atomic_t sel_class; atomic_t bandwidth_down; atomic_t bandwidth_up; atomic_t reselect; @@ -752,6 +757,17 @@ struct batadv_priv_dat { #endif #ifdef CONFIG_BATMAN_ADV_MCAST +/** + * struct batadv_mcast_querier_state - IGMP/MLD querier state when bridged + * @exists: whether a querier exists in the mesh + * @shadowing: if a querier exists, whether it is potentially shadowing + * multicast listeners (i.e. querier is behind our own bridge segment) + */ +struct batadv_mcast_querier_state { + bool exists; + bool shadowing; +}; + /** * struct batadv_priv_mcast - per mesh interface mcast data * @mla_list: list of multicast addresses we are currently announcing via TT @@ -759,8 +775,11 @@ struct batadv_priv_dat { * multicast traffic * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast traffic * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast traffic + * @querier_ipv4: the current state of an IGMP querier in the mesh + * @querier_ipv6: the current state of an MLD querier in the mesh * @flags: the flags we have last sent in our mcast tvlv * @enabled: whether the multicast tvlv is currently enabled + * @bridged: whether the soft interface has a bridge on top * @num_disabled: number of nodes that have no mcast tvlv * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP traffic * @num_want_all_ipv4: counter for items in want_all_ipv4_list @@ -773,8 +792,11 @@ struct batadv_priv_mcast { struct hlist_head want_all_unsnoopables_list; struct hlist_head want_all_ipv4_list; struct hlist_head want_all_ipv6_list; + struct batadv_mcast_querier_state querier_ipv4; + struct batadv_mcast_querier_state querier_ipv6; u8 flags; bool enabled; + bool bridged; atomic_t num_disabled; atomic_t num_want_all_unsnoopables; atomic_t num_want_all_ipv4; @@ -813,6 +835,111 @@ struct batadv_priv_nc { struct batadv_hashtable *decoding_hash; }; +/** + * struct batadv_tp_unacked - unacked packet meta-information + * @seqno: seqno of the unacked packet + * @len: length of the packet + * @list: list node for batadv_tp_vars::unacked_list + * + * This struct is supposed to represent a buffer unacked packet. However, since + * the purpose of the TP meter is to count the traffic only, there is no need to + * store the entire sk_buff, the starting offset and the length are enough + */ +struct batadv_tp_unacked { + u32 seqno; + u16 len; + struct list_head list; +}; + +/** + * enum batadv_tp_meter_role - Modus in tp meter session + * @BATADV_TP_RECEIVER: Initialized as receiver + * @BATADV_TP_SENDER: Initialized as sender + */ +enum batadv_tp_meter_role { + BATADV_TP_RECEIVER, + BATADV_TP_SENDER +}; + +/** + * struct batadv_tp_vars - tp meter private variables per session + * @list: list node for bat_priv::tp_list + * @timer: timer for ack (receiver) and retry (sender) + * @bat_priv: pointer to the mesh object + * @start_time: start time in jiffies + * @other_end: mac address of remote + * @role: receiver/sender modi + * @sending: sending binary semaphore: 1 if sending, 0 is not + * @reason: reason for a stopped session + * @finish_work: work item for the finishing procedure + * @test_length: test length in milliseconds + * @session: TP session identifier + * @icmp_uid: local ICMP "socket" index + * @dec_cwnd: decimal part of the cwnd used during linear growth + * @cwnd: current size of the congestion window + * @cwnd_lock: lock do protect @cwnd & @dec_cwnd + * @ss_threshold: Slow Start threshold. Once cwnd exceeds this value the + * connection switches to the Congestion Avoidance state + * @last_acked: last acked byte + * @last_sent: last sent byte, not yet acked + * @tot_sent: amount of data sent/ACKed so far + * @dup_acks: duplicate ACKs counter + * @fast_recovery: true if in Fast Recovery mode + * @recover: last sent seqno when entering Fast Recovery + * @rto: sender timeout + * @srtt: smoothed RTT scaled by 2^3 + * @rttvar: RTT variation scaled by 2^2 + * @more_bytes: waiting queue anchor when waiting for more ack/retry timeout + * @prerandom_offset: offset inside the prerandom buffer + * @prerandom_lock: spinlock protecting access to prerandom_offset + * @last_recv: last in-order received packet + * @unacked_list: list of unacked packets (meta-info only) + * @unacked_lock: protect unacked_list + * @last_recv_time: time time (jiffies) a msg was received + * @refcount: number of context where the object is used + * @rcu: struct used for freeing in an RCU-safe manner + */ +struct batadv_tp_vars { + struct hlist_node list; + struct timer_list timer; + struct batadv_priv *bat_priv; + unsigned long start_time; + u8 other_end[ETH_ALEN]; + enum batadv_tp_meter_role role; + atomic_t sending; + enum batadv_tp_meter_reason reason; + struct delayed_work finish_work; + u32 test_length; + u8 session[2]; + u8 icmp_uid; + + /* sender variables */ + u16 dec_cwnd; + u32 cwnd; + spinlock_t cwnd_lock; /* Protects cwnd & dec_cwnd */ + u32 ss_threshold; + atomic_t last_acked; + u32 last_sent; + atomic64_t tot_sent; + atomic_t dup_acks; + bool fast_recovery; + u32 recover; + u32 rto; + u32 srtt; + u32 rttvar; + wait_queue_head_t more_bytes; + u32 prerandom_offset; + spinlock_t prerandom_lock; /* Protects prerandom_offset */ + + /* receiver variables */ + u32 last_recv; + struct list_head unacked_list; + spinlock_t unacked_lock; /* Protects unacked_list */ + unsigned long last_recv_time; + struct kref refcount; + struct rcu_head rcu; +}; + /** * struct batadv_softif_vlan - per VLAN attributes set * @bat_priv: pointer to the mesh object @@ -867,8 +994,6 @@ struct batadv_priv_bat_v { * enabled * @multicast_mode: Enable or disable multicast optimizations on this node's * sender/originating side - * @gw_mode: gateway operation: off, client or server (see batadv_gw_modes) - * @gw_sel_class: gateway selection class (applies if gw_mode client) * @orig_interval: OGM broadcast interval in milliseconds * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop * @log_level: configured log level (see batadv_dbg_level) @@ -883,14 +1008,17 @@ struct batadv_priv_bat_v { * @debug_dir: dentry for debugfs batman-adv subdirectory * @forw_bat_list: list of aggregated OGMs that will be forwarded * @forw_bcast_list: list of broadcast packets that will be rebroadcasted + * @tp_list: list of tp sessions + * @tp_num: number of currently active tp sessions * @orig_hash: hash table containing mesh participants (orig nodes) * @forw_bat_list_lock: lock protecting forw_bat_list * @forw_bcast_list_lock: lock protecting forw_bcast_list + * @tp_list_lock: spinlock protecting @tp_list * @orig_work: work queue callback item for orig node purging * @cleanup_work: work queue callback item for soft-interface deinit * @primary_if: one of the hard-interfaces assigned to this mesh interface * becomes the primary interface - * @bat_algo_ops: routing algorithm used by this mesh interface + * @algo_ops: routing algorithm used by this mesh interface * @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top * of the mesh interface represented by this object * @softif_vlan_list_lock: lock protecting softif_vlan_list @@ -924,8 +1052,6 @@ struct batadv_priv { #ifdef CONFIG_BATMAN_ADV_MCAST atomic_t multicast_mode; #endif - atomic_t gw_mode; - atomic_t gw_sel_class; atomic_t orig_interval; atomic_t hop_penalty; #ifdef CONFIG_BATMAN_ADV_DEBUG @@ -941,13 +1067,16 @@ struct batadv_priv { struct dentry *debug_dir; struct hlist_head forw_bat_list; struct hlist_head forw_bcast_list; + struct hlist_head tp_list; struct batadv_hashtable *orig_hash; spinlock_t forw_bat_list_lock; /* protects forw_bat_list */ spinlock_t forw_bcast_list_lock; /* protects forw_bcast_list */ + spinlock_t tp_list_lock; /* protects tp_list */ + atomic_t tp_num; struct delayed_work orig_work; struct work_struct cleanup_work; struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */ - struct batadv_algo_ops *bat_algo_ops; + struct batadv_algo_ops *algo_ops; struct hlist_head softif_vlan_list; spinlock_t softif_vlan_list_lock; /* protects softif_vlan_list */ #ifdef CONFIG_BATMAN_ADV_BLA @@ -1264,67 +1393,78 @@ struct batadv_forw_packet { struct batadv_hard_iface *if_outgoing; }; +/** + * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific) + * @activate: start routing mechanisms when hard-interface is brought up + * @enable: init routing info when hard-interface is enabled + * @disable: de-init routing info when hard-interface is disabled + * @update_mac: (re-)init mac addresses of the protocol information + * belonging to this hard-interface + * @primary_set: called when primary interface is selected / changed + */ +struct batadv_algo_iface_ops { + void (*activate)(struct batadv_hard_iface *hard_iface); + int (*enable)(struct batadv_hard_iface *hard_iface); + void (*disable)(struct batadv_hard_iface *hard_iface); + void (*update_mac)(struct batadv_hard_iface *hard_iface); + void (*primary_set)(struct batadv_hard_iface *hard_iface); +}; + +/** + * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific) + * @hardif_init: called on creation of single hop entry + * @cmp: compare the metrics of two neighbors for their respective outgoing + * interfaces + * @is_similar_or_better: check if neigh1 is equally similar or better than + * neigh2 for their respective outgoing interface from the metric prospective + * @print: print the single hop neighbor list (optional) + */ +struct batadv_algo_neigh_ops { + void (*hardif_init)(struct batadv_hardif_neigh_node *neigh); + int (*cmp)(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2); + bool (*is_similar_or_better)(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2); + void (*print)(struct batadv_priv *priv, struct seq_file *seq); +}; + +/** + * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) + * @free: free the resources allocated by the routing algorithm for an orig_node + * object + * @add_if: ask the routing algorithm to apply the needed changes to the + * orig_node due to a new hard-interface being added into the mesh + * @del_if: ask the routing algorithm to apply the needed changes to the + * orig_node due to an hard-interface being removed from the mesh + * @print: print the originator table (optional) + */ +struct batadv_algo_orig_ops { + void (*free)(struct batadv_orig_node *orig_node); + int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num); + int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num, + int del_if_num); + void (*print)(struct batadv_priv *priv, struct seq_file *seq, + struct batadv_hard_iface *hard_iface); +}; + /** * struct batadv_algo_ops - mesh algorithm callbacks * @list: list node for the batadv_algo_list * @name: name of the algorithm - * @bat_iface_activate: start routing mechanisms when hard-interface is brought - * up - * @bat_iface_enable: init routing info when hard-interface is enabled - * @bat_iface_disable: de-init routing info when hard-interface is disabled - * @bat_iface_update_mac: (re-)init mac addresses of the protocol information - * belonging to this hard-interface - * @bat_primary_iface_set: called when primary interface is selected / changed - * @bat_ogm_schedule: prepare a new outgoing OGM for the send queue - * @bat_ogm_emit: send scheduled OGM - * @bat_hardif_neigh_init: called on creation of single hop entry - * @bat_neigh_cmp: compare the metrics of two neighbors for their respective - * outgoing interfaces - * @bat_neigh_is_similar_or_better: check if neigh1 is equally similar or - * better than neigh2 for their respective outgoing interface from the metric - * prospective - * @bat_neigh_print: print the single hop neighbor list (optional) - * @bat_neigh_free: free the resources allocated by the routing algorithm for a - * neigh_node object - * @bat_orig_print: print the originator table (optional) - * @bat_orig_free: free the resources allocated by the routing algorithm for an - * orig_node object - * @bat_orig_add_if: ask the routing algorithm to apply the needed changes to - * the orig_node due to a new hard-interface being added into the mesh - * @bat_orig_del_if: ask the routing algorithm to apply the needed changes to - * the orig_node due to an hard-interface being removed from the mesh + * @iface: callbacks related to interface handling + * @neigh: callbacks related to neighbors handling + * @orig: callbacks related to originators handling */ struct batadv_algo_ops { struct hlist_node list; char *name; - void (*bat_iface_activate)(struct batadv_hard_iface *hard_iface); - int (*bat_iface_enable)(struct batadv_hard_iface *hard_iface); - void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface); - void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface); - void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface); - void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface); - void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet); - /* neigh_node handling API */ - void (*bat_hardif_neigh_init)(struct batadv_hardif_neigh_node *neigh); - int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1, - struct batadv_hard_iface *if_outgoing1, - struct batadv_neigh_node *neigh2, - struct batadv_hard_iface *if_outgoing2); - bool (*bat_neigh_is_similar_or_better) - (struct batadv_neigh_node *neigh1, - struct batadv_hard_iface *if_outgoing1, - struct batadv_neigh_node *neigh2, - struct batadv_hard_iface *if_outgoing2); - void (*bat_neigh_print)(struct batadv_priv *priv, struct seq_file *seq); - void (*bat_neigh_free)(struct batadv_neigh_node *neigh); - /* orig_node handling API */ - void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq, - struct batadv_hard_iface *hard_iface); - void (*bat_orig_free)(struct batadv_orig_node *orig_node); - int (*bat_orig_add_if)(struct batadv_orig_node *orig_node, - int max_if_num); - int (*bat_orig_del_if)(struct batadv_orig_node *orig_node, - int max_if_num, int del_if_num); + struct batadv_algo_iface_ops iface; + struct batadv_algo_neigh_ops neigh; + struct batadv_algo_orig_ops orig; }; /** diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 780089d75..d020299ba 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -627,20 +627,9 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } -static struct lock_class_key bt_tx_busylock; -static struct lock_class_key bt_netdev_xmit_lock_key; - -static void bt_set_lockdep_class_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, &bt_netdev_xmit_lock_key); -} - static int bt_dev_init(struct net_device *dev) { - netdev_for_each_tx_queue(dev, bt_set_lockdep_class_one, NULL); - dev->qdisc_tx_busylock = &bt_tx_busylock; + netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 3df7aefb7..0b5f729d0 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -215,6 +215,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct sock *sk = sock->sk; struct sk_buff *skb; size_t copied; + size_t skblen; int err; BT_DBG("sock %p sk %p len %zu", sock, sk, len); @@ -230,6 +231,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, return err; } + skblen = skb->len; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; @@ -248,6 +250,9 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, skb_free_datagram(sk, skb); + if (flags & MSG_TRUNC) + copied = skblen; + return err ? : copied; } EXPORT_SYMBOL(bt_sock_recvmsg); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index bf9f8a801..3809617aa 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -625,7 +625,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) list_for_each_entry(d, &hci_dev_list, list) { if (!test_bit(HCI_UP, &d->flags) || hci_dev_test_flag(d, HCI_USER_CHANNEL) || - d->dev_type != HCI_BREDR) + d->dev_type != HCI_PRIMARY) continue; /* Simple routing: diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 45a9fc68c..ddf8432fe 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -260,14 +260,12 @@ static int hci_init1_req(struct hci_request *req, unsigned long opt) hci_reset_req(req, 0); switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: bredr_init(req); break; - case HCI_AMP: amp_init1(req); break; - default: BT_ERR("Unknown device type %d", hdev->dev_type); break; @@ -791,11 +789,11 @@ static int __hci_init(struct hci_dev *hdev) if (err < 0) return err; - /* HCI_BREDR covers both single-mode LE, BR/EDR and dual-mode + /* HCI_PRIMARY covers both single-mode LE, BR/EDR and dual-mode * BR/EDR/LE type controllers. AMP controllers only need the * first two stages of init. */ - if (hdev->dev_type != HCI_BREDR) + if (hdev->dev_type != HCI_PRIMARY) return 0; err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT, NULL); @@ -1202,7 +1200,7 @@ int hci_inquiry(void __user *arg) goto done; } - if (hdev->dev_type != HCI_BREDR) { + if (hdev->dev_type != HCI_PRIMARY) { err = -EOPNOTSUPP; goto done; } @@ -1307,7 +1305,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) * since AMP controllers do not have an address. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - hdev->dev_type == HCI_BREDR && + hdev->dev_type == HCI_PRIMARY && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY)) { ret = -EADDRNOTAVAIL; @@ -1402,7 +1400,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT) && - hdev->dev_type == HCI_BREDR) { + hdev->dev_type == HCI_PRIMARY) { ret = __hci_req_hci_power_on(hdev); mgmt_power_on(hdev, ret); } @@ -1563,7 +1561,7 @@ int hci_dev_do_close(struct hci_dev *hdev) auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); - if (!auto_off && hdev->dev_type == HCI_BREDR && + if (!auto_off && hdev->dev_type == HCI_PRIMARY && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); @@ -1802,7 +1800,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) goto done; } - if (hdev->dev_type != HCI_BREDR) { + if (hdev->dev_type != HCI_PRIMARY) { err = -EOPNOTSUPP; goto done; } @@ -2043,7 +2041,7 @@ static void hci_power_on(struct work_struct *work) */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || - (hdev->dev_type == HCI_BREDR && + (hdev->dev_type == HCI_PRIMARY && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); @@ -3030,7 +3028,7 @@ int hci_register_dev(struct hci_dev *hdev) * so the index can be used as the AMP controller ID. */ switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL); break; case HCI_AMP: @@ -3090,7 +3088,7 @@ int hci_register_dev(struct hci_dev *hdev) hci_dev_set_flag(hdev, HCI_SETUP); hci_dev_set_flag(hdev, HCI_AUTO_OFF); - if (hdev->dev_type == HCI_BREDR) { + if (hdev->dev_type == HCI_PRIMARY) { /* Assume BR/EDR support until proven otherwise (such as * through reading supported features during init. */ @@ -3165,6 +3163,8 @@ void hci_unregister_dev(struct hci_dev *hdev) device_del(&hdev->dev); debugfs_remove_recursive(hdev->debugfs); + kfree_const(hdev->hw_info); + kfree_const(hdev->fw_info); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); @@ -3268,6 +3268,28 @@ int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) } EXPORT_SYMBOL(hci_recv_diag); +void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...) +{ + va_list vargs; + + va_start(vargs, fmt); + kfree_const(hdev->hw_info); + hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); + va_end(vargs); +} +EXPORT_SYMBOL(hci_set_hw_info); + +void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...) +{ + va_list vargs; + + va_start(vargs, fmt); + kfree_const(hdev->fw_info); + hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); + va_end(vargs); +} +EXPORT_SYMBOL(hci_set_fw_info); + /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) @@ -3415,7 +3437,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: hci_add_acl_hdr(skb, conn->handle, flags); break; case HCI_AMP: @@ -3826,7 +3848,7 @@ static void hci_sched_acl(struct hci_dev *hdev) BT_DBG("%s", hdev->name); /* No ACL link over BR/EDR controller */ - if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_BREDR) + if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_PRIMARY) return; /* No AMP link over AMP controller */ diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 7db422094..63df63ebf 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -76,6 +76,30 @@ static const struct file_operations __name ## _fops = { \ .llseek = default_llseek, \ } \ +#define DEFINE_INFO_ATTRIBUTE(__name, __field) \ +static int __name ## _show(struct seq_file *f, void *ptr) \ +{ \ + struct hci_dev *hdev = f->private; \ + \ + hci_dev_lock(hdev); \ + seq_printf(f, "%s\n", hdev->__field ? : ""); \ + hci_dev_unlock(hdev); \ + \ + return 0; \ +} \ + \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, __name ## _show, inode->i_private); \ +} \ + \ +static const struct file_operations __name ## _fops = { \ + .open = __name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} \ + static int features_show(struct seq_file *f, void *ptr) { struct hci_dev *hdev = f->private; @@ -349,6 +373,9 @@ static const struct file_operations sc_only_mode_fops = { .llseek = default_llseek, }; +DEFINE_INFO_ATTRIBUTE(hardware_info, hw_info); +DEFINE_INFO_ATTRIBUTE(firmware_info, fw_info); + void hci_debugfs_create_common(struct hci_dev *hdev) { debugfs_create_file("features", 0444, hdev->debugfs, hdev, @@ -382,6 +409,14 @@ void hci_debugfs_create_common(struct hci_dev *hdev) if (lmp_sc_capable(hdev) || lmp_le_capable(hdev)) debugfs_create_file("sc_only_mode", 0444, hdev->debugfs, hdev, &sc_only_mode_fops); + + if (hdev->hw_info) + debugfs_create_file("hardware_info", 0444, hdev->debugfs, + hdev, &hardware_info_fops); + + if (hdev->fw_info) + debugfs_create_file("firmware_info", 0444, hdev->debugfs, + hdev, &firmware_info_fops); } static int inquiry_cache_show(struct seq_file *f, void *p) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d4b3dd541..e17aacbc5 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2332,7 +2332,7 @@ static u8 hci_to_mgmt_reason(u8 err) static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_disconn_complete *ev = (void *) skb->data; - u8 reason = hci_to_mgmt_reason(ev->reason); + u8 reason; struct hci_conn_params *params; struct hci_conn *conn; bool mgmt_connected; @@ -2355,6 +2355,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->state = BT_CLOSED; mgmt_connected = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags); + + if (test_bit(HCI_CONN_AUTH_FAILURE, &conn->flags)) + reason = MGMT_DEV_DISCONN_AUTH_FAILURE; + else + reason = hci_to_mgmt_reason(ev->reason); + mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type, reason, mgmt_connected); @@ -2421,6 +2427,8 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; if (!ev->status) { + clear_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + if (!hci_conn_ssp_enabled(conn) && test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) { BT_INFO("re-auth of legacy device is not possible."); @@ -2429,6 +2437,9 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->sec_level = conn->pending_sec_level; } } else { + if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING) + set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + mgmt_auth_failed(conn, ev->status); } @@ -2613,6 +2624,9 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); if (ev->status && conn->state == BT_CONNECTED) { + if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING) + set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); hci_conn_drop(conn); goto unlock; @@ -3249,7 +3263,7 @@ static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev, struct hci_chan *chan; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: return hci_conn_hash_lookup_handle(hdev, handle); case HCI_AMP: chan = hci_chan_lookup_handle(hdev, handle); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index c045b3c54..b0e23dfc5 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -262,6 +262,8 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, break; } + kfree_skb(hdev->req_skb); + hdev->req_skb = NULL; hdev->req_status = hdev->req_result = 0; BT_DBG("%s end: err %d", hdev->name, err); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1298d723c..96f04b7b9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -676,7 +676,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return -EOPNOTSUPP; - if (hdev->dev_type != HCI_BREDR) + if (hdev->dev_type != HCI_PRIMARY) return -EOPNOTSUPP; switch (cmd) { @@ -1048,6 +1048,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; + unsigned int skblen; BT_DBG("sock %p, sk %p", sock, sk); @@ -1064,6 +1065,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, if (!skb) return err; + skblen = skb->len; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; @@ -1089,6 +1091,9 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, skb_free_datagram(sk, skb); + if (flags & MSG_TRUNC) + copied = skblen; + return err ? : copied; } diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 555982a78..ca7a35eba 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -7,50 +7,6 @@ static struct class *bt_class; -static inline char *link_typetostr(int type) -{ - switch (type) { - case ACL_LINK: - return "ACL"; - case SCO_LINK: - return "SCO"; - case ESCO_LINK: - return "eSCO"; - case LE_LINK: - return "LE"; - default: - return "UNKNOWN"; - } -} - -static ssize_t show_link_type(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_conn *conn = to_hci_conn(dev); - return sprintf(buf, "%s\n", link_typetostr(conn->type)); -} - -static ssize_t show_link_address(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_conn *conn = to_hci_conn(dev); - return sprintf(buf, "%pMR\n", &conn->dst); -} - -#define LINK_ATTR(_name, _mode, _show, _store) \ -struct device_attribute link_attr_##_name = __ATTR(_name, _mode, _show, _store) - -static LINK_ATTR(type, S_IRUGO, show_link_type, NULL); -static LINK_ATTR(address, S_IRUGO, show_link_address, NULL); - -static struct attribute *bt_link_attrs[] = { - &link_attr_type.attr, - &link_attr_address.attr, - NULL -}; - -ATTRIBUTE_GROUPS(bt_link); - static void bt_link_release(struct device *dev) { struct hci_conn *conn = to_hci_conn(dev); @@ -59,7 +15,6 @@ static void bt_link_release(struct device *dev) static struct device_type bt_link = { .name = "link", - .groups = bt_link_groups, .release = bt_link_release, }; @@ -124,59 +79,6 @@ void hci_conn_del_sysfs(struct hci_conn *conn) hci_dev_put(hdev); } -static inline char *host_typetostr(int type) -{ - switch (type) { - case HCI_BREDR: - return "BR/EDR"; - case HCI_AMP: - return "AMP"; - default: - return "UNKNOWN"; - } -} - -static ssize_t show_type(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_dev *hdev = to_hci_dev(dev); - return sprintf(buf, "%s\n", host_typetostr(hdev->dev_type)); -} - -static ssize_t show_name(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_dev *hdev = to_hci_dev(dev); - char name[HCI_MAX_NAME_LENGTH + 1]; - int i; - - for (i = 0; i < HCI_MAX_NAME_LENGTH; i++) - name[i] = hdev->dev_name[i]; - - name[HCI_MAX_NAME_LENGTH] = '\0'; - return sprintf(buf, "%s\n", name); -} - -static ssize_t show_address(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_dev *hdev = to_hci_dev(dev); - return sprintf(buf, "%pMR\n", &hdev->bdaddr); -} - -static DEVICE_ATTR(type, S_IRUGO, show_type, NULL); -static DEVICE_ATTR(name, S_IRUGO, show_name, NULL); -static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); - -static struct attribute *bt_host_attrs[] = { - &dev_attr_type.attr, - &dev_attr_name.attr, - &dev_attr_address.attr, - NULL -}; - -ATTRIBUTE_GROUPS(bt_host); - static void bt_host_release(struct device *dev) { struct hci_dev *hdev = to_hci_dev(dev); @@ -186,7 +88,6 @@ static void bt_host_release(struct device *dev) static struct device_type bt_host = { .name = "host", - .groups = bt_host_groups, .release = bt_host_release, }; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index eb4f5f24c..d4cad29b0 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -5835,6 +5836,9 @@ static int l2cap_reassemble_sdu(struct l2cap_chan *chan, struct sk_buff *skb, if (chan->sdu) break; + if (!pskb_may_pull(skb, L2CAP_SDULEN_SIZE)) + break; + chan->sdu_len = get_unaligned_le16(skb->data); skb_pull(skb, L2CAP_SDULEN_SIZE); @@ -6610,6 +6614,10 @@ static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) goto drop; } + if ((chan->mode == L2CAP_MODE_ERTM || + chan->mode == L2CAP_MODE_STREAMING) && sk_filter(chan->data, skb)) + goto drop; + if (!control->sframe) { int err; @@ -7468,7 +7476,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) int len; /* For AMP controller do not create l2cap conn */ - if (!conn && hcon->hdev->dev_type != HCI_BREDR) + if (!conn && hcon->hdev->dev_type != HCI_PRIMARY) goto drop; if (!conn) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 1842141ba..a8ba75273 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1019,7 +1019,7 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg, goto done; if (pi->rx_busy_skb) { - if (!sock_queue_rcv_skb(sk, pi->rx_busy_skb)) + if (!__sock_queue_rcv_skb(sk, pi->rx_busy_skb)) pi->rx_busy_skb = NULL; else goto done; @@ -1270,7 +1270,17 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) goto done; } - err = sock_queue_rcv_skb(sk, skb); + if (chan->mode != L2CAP_MODE_ERTM && + chan->mode != L2CAP_MODE_STREAMING) { + /* Even if no filter is attached, we could potentially + * get errors from security modules, etc. + */ + err = sk_filter(sk, skb); + if (err) + goto done; + } + + err = __sock_queue_rcv_skb(sk, skb); /* For ERTM, handle one skb that doesn't fit into the recv * buffer. This is important to do because the data frames diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 9e4b93158..7639290b6 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 12 +#define MGMT_REVISION 13 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -359,7 +359,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && !hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -384,7 +384,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && !hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); BT_DBG("Added hci%u", d->id); @@ -419,7 +419,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -444,7 +444,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); BT_DBG("Added hci%u", d->id); @@ -479,7 +479,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR || d->dev_type == HCI_AMP) + if (d->dev_type == HCI_PRIMARY || d->dev_type == HCI_AMP) count++; } @@ -503,7 +503,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR) { + if (d->dev_type == HCI_PRIMARY) { if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) rp->entry[count].type = 0x01; else @@ -6366,7 +6366,7 @@ void mgmt_index_added(struct hci_dev *hdev) return; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS); @@ -6399,7 +6399,7 @@ void mgmt_index_removed(struct hci_dev *hdev) return; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 50976a648..4c1a16a96 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -22,9 +22,9 @@ #include #include +#include #include #include -#include #include #include @@ -88,7 +88,7 @@ struct smp_dev { u8 min_key_size; u8 max_key_size; - struct crypto_skcipher *tfm_aes; + struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; }; @@ -127,7 +127,7 @@ struct smp_chan { u8 dhkey[32]; u8 mackey[16]; - struct crypto_skcipher *tfm_aes; + struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; }; @@ -361,10 +361,8 @@ static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16], * s1 and ah. */ -static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r) +static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); - struct scatterlist sg; uint8_t tmp[16], data[16]; int err; @@ -378,7 +376,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r) /* The most significant octet of key corresponds to k[0] */ swap_buf(k, tmp, 16); - err = crypto_skcipher_setkey(tfm, tmp, 16); + err = crypto_cipher_setkey(tfm, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; @@ -387,16 +385,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r) /* Most significant octet of plaintextData corresponds to data[0] */ swap_buf(r, data, 16); - sg_init_one(&sg, data, 16); - - skcipher_request_set_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, 16, NULL); - - err = crypto_skcipher_encrypt(req); - skcipher_request_zero(req); - if (err) - BT_ERR("Encrypt data error %d", err); + crypto_cipher_encrypt_one(tfm, data, data); /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); @@ -406,7 +395,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r) return err; } -static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16], +static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16], const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat, const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16]) { @@ -455,7 +444,7 @@ static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16], return err; } -static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16], +static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16], const u8 r1[16], const u8 r2[16], u8 _r[16]) { int err; @@ -471,7 +460,7 @@ static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16], return err; } -static int smp_ah(struct crypto_skcipher *tfm, const u8 irk[16], +static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16], const u8 r[3], u8 res[3]) { u8 _res[16]; @@ -759,7 +748,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn) kzfree(smp->slave_csrk); kzfree(smp->link_key); - crypto_free_skcipher(smp->tfm_aes); + crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); /* Ensure that we don't leave any debug key around if debug key @@ -1359,9 +1348,9 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) if (!smp) return NULL; - smp->tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(smp->tfm_aes)) { - BT_ERR("Unable to create ECB crypto context"); + BT_ERR("Unable to create AES crypto context"); kzfree(smp); return NULL; } @@ -1369,7 +1358,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(smp->tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_skcipher(smp->tfm_aes); + crypto_free_cipher(smp->tfm_aes); kzfree(smp); return NULL; } @@ -3120,7 +3109,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) { struct l2cap_chan *chan; struct smp_dev *smp; - struct crypto_skcipher *tfm_aes; + struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; if (cid == L2CAP_CID_SMP_BREDR) { @@ -3132,9 +3121,9 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) if (!smp) return ERR_PTR(-ENOMEM); - tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_aes)) { - BT_ERR("Unable to create ECB crypto context"); + BT_ERR("Unable to create AES crypto context"); kzfree(smp); return ERR_CAST(tfm_aes); } @@ -3142,7 +3131,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_skcipher(tfm_aes); + crypto_free_cipher(tfm_aes); kzfree(smp); return ERR_CAST(tfm_cmac); } @@ -3156,7 +3145,7 @@ create_chan: chan = l2cap_chan_create(); if (!chan) { if (smp) { - crypto_free_skcipher(smp->tfm_aes); + crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); kzfree(smp); } @@ -3203,7 +3192,7 @@ static void smp_del_chan(struct l2cap_chan *chan) smp = chan->data; if (smp) { chan->data = NULL; - crypto_free_skcipher(smp->tfm_aes); + crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); kzfree(smp); } @@ -3440,7 +3429,7 @@ void smp_unregister(struct hci_dev *hdev) #if IS_ENABLED(CONFIG_BT_SELFTEST_SMP) -static int __init test_ah(struct crypto_skcipher *tfm_aes) +static int __init test_ah(struct crypto_cipher *tfm_aes) { const u8 irk[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, @@ -3460,7 +3449,7 @@ static int __init test_ah(struct crypto_skcipher *tfm_aes) return 0; } -static int __init test_c1(struct crypto_skcipher *tfm_aes) +static int __init test_c1(struct crypto_cipher *tfm_aes) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -3490,7 +3479,7 @@ static int __init test_c1(struct crypto_skcipher *tfm_aes) return 0; } -static int __init test_s1(struct crypto_skcipher *tfm_aes) +static int __init test_s1(struct crypto_cipher *tfm_aes) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -3686,7 +3675,7 @@ static const struct file_operations test_smp_fops = { .llseek = default_llseek, }; -static int __init run_selftests(struct crypto_skcipher *tfm_aes, +static int __init run_selftests(struct crypto_cipher *tfm_aes, struct crypto_shash *tfm_cmac) { ktime_t calltime, delta, rettime; @@ -3764,27 +3753,27 @@ done: int __init bt_selftest_smp(void) { - struct crypto_skcipher *tfm_aes; + struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; int err; - tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_aes)) { - BT_ERR("Unable to create ECB crypto context"); + BT_ERR("Unable to create AES crypto context"); return PTR_ERR(tfm_aes); } tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_skcipher(tfm_aes); + crypto_free_cipher(tfm_aes); return PTR_ERR(tfm_cmac); } err = run_selftests(tfm_aes, tfm_cmac); crypto_free_shash(tfm_cmac); - crypto_free_skcipher(tfm_aes); + crypto_free_cipher(tfm_aes); return err; } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 2c8095a5d..09f26940a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -61,11 +61,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) goto out; - if (is_broadcast_ether_addr(dest)) - br_flood_deliver(br, skb, false); - else if (is_multicast_ether_addr(dest)) { + if (is_broadcast_ether_addr(dest)) { + br_flood(br, skb, false, false, true); + } else if (is_multicast_ether_addr(dest)) { if (unlikely(netpoll_tx_running(dev))) { - br_flood_deliver(br, skb, false); + br_flood(br, skb, false, false, true); goto out; } if (br_multicast_rcv(br, NULL, skb, vid)) { @@ -76,14 +76,14 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(br, eth_hdr(skb))) - br_multicast_deliver(mdst, skb); + br_multicast_flood(mdst, skb, false, true); else - br_flood_deliver(br, skb, false); - } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) - br_deliver(dst->dst, skb); - else - br_flood_deliver(br, skb, true); - + br_flood(br, skb, false, false, true); + } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) { + br_forward(dst->dst, skb, false, true); + } else { + br_flood(br, skb, true, false, true); + } out: rcu_read_unlock(); return NETDEV_TX_OK; @@ -104,8 +104,16 @@ static int br_dev_init(struct net_device *dev) return -ENOMEM; err = br_vlan_init(br); - if (err) + if (err) { free_percpu(br->stats); + return err; + } + + err = br_multicast_init_stats(br); + if (err) { + free_percpu(br->stats); + br_vlan_flush(br); + } br_set_lockdep_class(dev); return err; @@ -341,6 +349,8 @@ static const struct net_device_ops br_netdev_ops = { .ndo_add_slave = br_add_slave, .ndo_del_slave = br_del_slave, .ndo_fix_features = br_fix_features, + .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, + .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_fdb_add = br_fdb_add, .ndo_fdb_del = br_fdb_delete, .ndo_fdb_dump = br_fdb_dump, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index c18080ad4..cd620fab4 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -267,7 +267,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) /* If old entry was unassociated with any port, then delete it. */ f = __br_fdb_get(br, br->dev->dev_addr, 0); - if (f && f->is_local && !f->dst) + if (f && f->is_local && !f->dst && !f->added_by_user) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); @@ -282,7 +282,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (!br_vlan_should_use(v)) continue; f = __br_fdb_get(br, br->dev->dev_addr, v->vid); - if (f && f->is_local && !f->dst) + if (f && f->is_local && !f->dst && !f->added_by_user) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, v->vid); } @@ -764,20 +764,25 @@ out: } /* Update (create or replace) forwarding database entry */ -static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, - __u16 state, __u16 flags, __u16 vid) +static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, + const __u8 *addr, __u16 state, __u16 flags, __u16 vid) { - struct net_bridge *br = source->br; struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; bool modified = false; /* If the port cannot learn allow only local and static entries */ - if (!(state & NUD_PERMANENT) && !(state & NUD_NOARP) && + if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && !(source->state == BR_STATE_LEARNING || source->state == BR_STATE_FORWARDING)) return -EPERM; + if (!source && !(state & NUD_PERMANENT)) { + pr_info("bridge: RTM_NEWNEIGH %s without NUD_PERMANENT\n", + br->dev->name); + return -EINVAL; + } + fdb = fdb_find(head, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -832,22 +837,28 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, return 0; } -static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p, - const unsigned char *addr, u16 nlh_flags, u16 vid) +static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, + struct net_bridge_port *p, const unsigned char *addr, + u16 nlh_flags, u16 vid) { int err = 0; if (ndm->ndm_flags & NTF_USE) { + if (!p) { + pr_info("bridge: RTM_NEWNEIGH %s with NTF_USE is not supported\n", + br->dev->name); + return -EINVAL; + } local_bh_disable(); rcu_read_lock(); - br_fdb_update(p->br, p, addr, vid, true); + br_fdb_update(br, p, addr, vid, true); rcu_read_unlock(); local_bh_enable(); } else { - spin_lock_bh(&p->br->hash_lock); - err = fdb_add_entry(p, addr, ndm->ndm_state, + spin_lock_bh(&br->hash_lock); + err = fdb_add_entry(br, p, addr, ndm->ndm_state, nlh_flags, vid); - spin_unlock_bh(&p->br->hash_lock); + spin_unlock_bh(&br->hash_lock); } return err; @@ -884,6 +895,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], dev->name); return -EINVAL; } + br = p->br; vg = nbp_vlan_group(p); } @@ -895,15 +907,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } /* VID was specified, so use it. */ - if (dev->priv_flags & IFF_EBRIDGE) - err = br_fdb_insert(br, NULL, addr, vid); - else - err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid); } else { - if (dev->priv_flags & IFF_EBRIDGE) - err = br_fdb_insert(br, NULL, addr, 0); - else - err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0); if (err || !vg || !vg->num_vlans) goto out; @@ -914,11 +920,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - if (dev->priv_flags & IFF_EBRIDGE) - err = br_fdb_insert(br, NULL, addr, v->vid); - else - err = __br_fdb_add(ndm, p, addr, nlh_flags, - v->vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid); if (err) goto out; } diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index f47759f05..63a83d8d7 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -21,11 +21,6 @@ #include #include "br_private.h" -static int deliver_clone(const struct net_bridge_port *prev, - struct sk_buff *skb, - void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb)); - /* Don't forward packets to originating port or forwarding disabled */ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) @@ -75,105 +70,92 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(br_forward_finish); -static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) +static void __br_forward(const struct net_bridge_port *to, + struct sk_buff *skb, bool local_orig) { struct net_bridge_vlan_group *vg; + struct net_device *indev; + struct net *net; + int br_hook; vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, vg, skb); if (!skb) return; + indev = skb->dev; skb->dev = to->dev; - - if (unlikely(netpoll_tx_running(to->br->dev))) { - if (!is_skb_forwardable(skb->dev, skb)) + if (!local_orig) { + if (skb_warn_if_lro(skb)) { kfree_skb(skb); - else { - skb_push(skb, ETH_HLEN); - br_netpoll_send_skb(to, skb); + return; } - return; + br_hook = NF_BR_FORWARD; + skb_forward_csum(skb); + net = dev_net(indev); + } else { + if (unlikely(netpoll_tx_running(to->br->dev))) { + if (!is_skb_forwardable(skb->dev, skb)) { + kfree_skb(skb); + } else { + skb_push(skb, ETH_HLEN); + br_netpoll_send_skb(to, skb); + } + return; + } + br_hook = NF_BR_LOCAL_OUT; + net = dev_net(skb->dev); + indev = NULL; } - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, - dev_net(skb->dev), NULL, skb,NULL, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, br_hook, + net, NULL, skb, indev, skb->dev, br_forward_finish); } -static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) +static int deliver_clone(const struct net_bridge_port *prev, + struct sk_buff *skb, bool local_orig) { - struct net_bridge_vlan_group *vg; - struct net_device *indev; - - if (skb_warn_if_lro(skb)) { - kfree_skb(skb); - return; - } - - vg = nbp_vlan_group_rcu(to); - skb = br_handle_vlan(to->br, vg, skb); - if (!skb) - return; - - indev = skb->dev; - skb->dev = to->dev; - skb_forward_csum(skb); - - NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, - dev_net(indev), NULL, skb, indev, skb->dev, - br_forward_finish); -} + struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; -/* called with rcu_read_lock */ -void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) -{ - if (to && should_deliver(to, skb)) { - __br_deliver(to, skb); - return; + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) { + dev->stats.tx_dropped++; + return -ENOMEM; } - kfree_skb(skb); + __br_forward(prev, skb, local_orig); + return 0; } -EXPORT_SYMBOL_GPL(br_deliver); -/* called with rcu_read_lock */ -void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0) +/** + * br_forward - forward a packet to a specific port + * @to: destination port + * @skb: packet being forwarded + * @local_rcv: packet will be received locally after forwarding + * @local_orig: packet is locally originated + * + * Should be called with rcu_read_lock. + */ +void br_forward(const struct net_bridge_port *to, + struct sk_buff *skb, bool local_rcv, bool local_orig) { if (to && should_deliver(to, skb)) { - if (skb0) - deliver_clone(to, skb, __br_forward); + if (local_rcv) + deliver_clone(to, skb, local_orig); else - __br_forward(to, skb); + __br_forward(to, skb, local_orig); return; } - if (!skb0) + if (!local_rcv) kfree_skb(skb); } - -static int deliver_clone(const struct net_bridge_port *prev, - struct sk_buff *skb, - void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb)) -{ - struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; - - skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) { - dev->stats.tx_dropped++; - return -ENOMEM; - } - - __packet_hook(prev, skb); - return 0; -} +EXPORT_SYMBOL_GPL(br_forward); static struct net_bridge_port *maybe_deliver( struct net_bridge_port *prev, struct net_bridge_port *p, - struct sk_buff *skb, - void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb)) + struct sk_buff *skb, bool local_orig) { int err; @@ -183,7 +165,7 @@ static struct net_bridge_port *maybe_deliver( if (!prev) goto out; - err = deliver_clone(prev, skb, __packet_hook); + err = deliver_clone(prev, skb, local_orig); if (err) return ERR_PTR(err); @@ -191,17 +173,13 @@ out: return p; } -/* called under bridge lock */ -static void br_flood(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb0, - void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb), - bool unicast) +/* called under rcu_read_lock */ +void br_flood(struct net_bridge *br, struct sk_buff *skb, + bool unicast, bool local_rcv, bool local_orig) { + u8 igmp_type = br_multicast_igmp_type(skb); + struct net_bridge_port *prev = NULL; struct net_bridge_port *p; - struct net_bridge_port *prev; - - prev = NULL; list_for_each_entry_rcu(p, &br->port_list, list) { /* Do not flood unicast traffic to ports that turn it off */ @@ -215,48 +193,36 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, BR_INPUT_SKB_CB(skb)->proxyarp_replied) continue; - prev = maybe_deliver(prev, p, skb, __packet_hook); + prev = maybe_deliver(prev, p, skb, local_orig); if (IS_ERR(prev)) goto out; + if (prev == p) + br_multicast_count(p->br, p, skb, igmp_type, + BR_MCAST_DIR_TX); } if (!prev) goto out; - if (skb0) - deliver_clone(prev, skb, __packet_hook); + if (local_rcv) + deliver_clone(prev, skb, local_orig); else - __packet_hook(prev, skb); + __br_forward(prev, skb, local_orig); return; out: - if (!skb0) + if (!local_rcv) kfree_skb(skb); } - -/* called with rcu_read_lock */ -void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast) -{ - br_flood(br, skb, NULL, __br_deliver, unicast); -} - -/* called under bridge lock */ -void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb2, bool unicast) -{ - br_flood(br, skb, skb2, __br_forward, unicast); -} - #ifdef CONFIG_BRIDGE_IGMP_SNOOPING /* called with rcu_read_lock */ -static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, struct sk_buff *skb0, - void (*__packet_hook)( - const struct net_bridge_port *p, - struct sk_buff *skb)) +void br_multicast_flood(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, + bool local_rcv, bool local_orig) { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; + u8 igmp_type = br_multicast_igmp_type(skb); struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; @@ -274,9 +240,12 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, port = (unsigned long)lport > (unsigned long)rport ? lport : rport; - prev = maybe_deliver(prev, port, skb, __packet_hook); + prev = maybe_deliver(prev, port, skb, local_orig); if (IS_ERR(prev)) goto out; + if (prev == port) + br_multicast_count(port->br, port, skb, igmp_type, + BR_MCAST_DIR_TX); if ((unsigned long)lport >= (unsigned long)port) p = rcu_dereference(p->next); @@ -287,28 +256,14 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, if (!prev) goto out; - if (skb0) - deliver_clone(prev, skb, __packet_hook); + if (local_rcv) + deliver_clone(prev, skb, local_orig); else - __packet_hook(prev, skb); + __br_forward(prev, skb, local_orig); return; out: - if (!skb0) + if (!local_rcv) kfree_skb(skb); } - -/* called with rcu_read_lock */ -void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb) -{ - br_multicast_flood(mdst, skb, NULL, __br_deliver); -} - -/* called with rcu_read_lock */ -void br_multicast_forward(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, struct sk_buff *skb2) -{ - br_multicast_flood(mdst, skb, skb2, __br_forward); -} #endif diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 8217aecf0..f2fede05d 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -345,8 +345,8 @@ static int find_portno(struct net_bridge *br) static struct net_bridge_port *new_nbp(struct net_bridge *br, struct net_device *dev) { - int index; struct net_bridge_port *p; + int index, err; index = find_portno(br); if (index < 0) @@ -366,7 +366,12 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); - br_multicast_add_port(p); + err = br_multicast_add_port(p); + if (err) { + dev_put(dev); + kfree(p); + p = ERR_PTR(err); + } return p; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 28d5ec269..abe11f085 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -60,6 +60,9 @@ static int br_pass_frame_up(struct sk_buff *skb) skb = br_handle_vlan(br, vg, skb); if (!skb) return NET_RX_DROP; + /* update the multicast stats if the packet is IGMP/MLD */ + br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), + BR_MCAST_DIR_TX); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, @@ -77,13 +80,10 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; - if (dev->flags & IFF_NOARP) + if ((dev->flags & IFF_NOARP) || + !pskb_may_pull(skb, arp_hdr_len(dev))) return; - if (!pskb_may_pull(skb, arp_hdr_len(dev))) { - dev->stats.tx_dropped++; - return; - } parp = arp_hdr(skb); if (parp->ar_pro != htons(ETH_P_IP) || @@ -128,13 +128,12 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - const unsigned char *dest = eth_hdr(skb)->h_dest; + bool local_rcv = false, mcast_hit = false, unicast = true; struct net_bridge_port *p = br_port_get_rcu(skb->dev); - struct net_bridge *br; - struct net_bridge_fdb_entry *dst; + const unsigned char *dest = eth_hdr(skb)->h_dest; + struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mdb_entry *mdst; - struct sk_buff *skb2; - bool unicast = true; + struct net_bridge *br; u16 vid = 0; if (!p || p->state == BR_STATE_DISABLED) @@ -157,53 +156,46 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb BR_INPUT_SKB_CB(skb)->brdev = br->dev; - /* The packet skb2 goes to the local host (NULL to skip). */ - skb2 = NULL; - - if (br->dev->flags & IFF_PROMISC) - skb2 = skb; - - dst = NULL; + local_rcv = !!(br->dev->flags & IFF_PROMISC); if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) br_do_proxy_arp(skb, br, vid, p); if (is_broadcast_ether_addr(dest)) { - skb2 = skb; + local_rcv = true; unicast = false; } else if (is_multicast_ether_addr(dest)) { mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(br, eth_hdr(skb))) { if ((mdst && mdst->mglist) || - br_multicast_is_router(br)) - skb2 = skb; - br_multicast_forward(mdst, skb, skb2); - skb = NULL; - if (!skb2) - goto out; - } else - skb2 = skb; - + br_multicast_is_router(br)) { + local_rcv = true; + br->dev->stats.multicast++; + } + mcast_hit = true; + } else { + local_rcv = true; + br->dev->stats.multicast++; + } unicast = false; - br->dev->stats.multicast++; - } else if ((dst = __br_fdb_get(br, dest, vid)) && - dst->is_local) { - skb2 = skb; + } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) { /* Do not forward the packet since it's local. */ - skb = NULL; + return br_pass_frame_up(skb); } - if (skb) { - if (dst) { - dst->used = jiffies; - br_forward(dst->dst, skb, skb2); - } else - br_flood_forward(br, skb, skb2, unicast); + if (dst) { + dst->used = jiffies; + br_forward(dst->dst, skb, local_rcv, false); + } else { + if (!mcast_hit) + br_flood(br, skb, unicast, local_rcv, false); + else + br_multicast_flood(mdst, skb, local_rcv, false); } - if (skb2) - return br_pass_frame_up(skb2); + if (local_rcv) + return br_pass_frame_up(skb); out: return 0; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index d3abdaefe..c5fea9393 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -361,7 +361,8 @@ out: } static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, - __be32 group) + __be32 group, + u8 *igmp_type) { struct sk_buff *skb; struct igmphdr *ih; @@ -411,6 +412,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, skb_set_transport_header(skb, skb->len); ih = igmp_hdr(skb); + *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; ih->type = IGMP_HOST_MEMBERSHIP_QUERY; ih->code = (group ? br->multicast_last_member_interval : br->multicast_query_response_interval) / @@ -428,7 +430,8 @@ out: #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, - const struct in6_addr *group) + const struct in6_addr *grp, + u8 *igmp_type) { struct sk_buff *skb; struct ipv6hdr *ip6h; @@ -487,16 +490,17 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, skb_set_transport_header(skb, skb->len); mldq = (struct mld_msg *) icmp6_hdr(skb); - interval = ipv6_addr_any(group) ? + interval = ipv6_addr_any(grp) ? br->multicast_query_response_interval : br->multicast_last_member_interval; + *igmp_type = ICMPV6_MGM_QUERY; mldq->mld_type = ICMPV6_MGM_QUERY; mldq->mld_code = 0; mldq->mld_cksum = 0; mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); mldq->mld_reserved = 0; - mldq->mld_mca = *group; + mldq->mld_mca = *grp; /* checksum */ mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -513,14 +517,16 @@ out: #endif static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, - struct br_ip *addr) + struct br_ip *addr, + u8 *igmp_type) { switch (addr->proto) { case htons(ETH_P_IP): - return br_ip4_multicast_alloc_query(br, addr->u.ip4); + return br_ip4_multicast_alloc_query(br, addr->u.ip4, igmp_type); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return br_ip6_multicast_alloc_query(br, &addr->u.ip6); + return br_ip6_multicast_alloc_query(br, &addr->u.ip6, + igmp_type); #endif } return NULL; @@ -829,18 +835,23 @@ static void __br_multicast_send_query(struct net_bridge *br, struct br_ip *ip) { struct sk_buff *skb; + u8 igmp_type; - skb = br_multicast_alloc_query(br, ip); + skb = br_multicast_alloc_query(br, ip, &igmp_type); if (!skb) return; if (port) { skb->dev = port->dev; + br_multicast_count(br, port, skb, igmp_type, + BR_MCAST_DIR_TX); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, dev_net(port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); } else { br_multicast_select_own_querier(br, ip, skb); + br_multicast_count(br, port, skb, igmp_type, + BR_MCAST_DIR_RX); netif_rx(skb); } } @@ -918,7 +929,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data) } #endif -void br_multicast_add_port(struct net_bridge_port *port) +int br_multicast_add_port(struct net_bridge_port *port) { port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; @@ -930,6 +941,11 @@ void br_multicast_add_port(struct net_bridge_port *port) setup_timer(&port->ip6_own_query.timer, br_ip6_multicast_port_query_expired, (unsigned long)port); #endif + port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); + if (!port->mcast_stats) + return -ENOMEM; + + return 0; } void br_multicast_del_port(struct net_bridge_port *port) @@ -944,6 +960,7 @@ void br_multicast_del_port(struct net_bridge_port *port) br_multicast_del_pg(br, pg); spin_unlock_bh(&br->multicast_lock); del_timer_sync(&port->multicast_router_timer); + free_percpu(port->mcast_stats); } static void br_multicast_enable(struct bridge_mcast_own_query *query) @@ -1583,6 +1600,39 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, } #endif +static void br_multicast_err_count(const struct net_bridge *br, + const struct net_bridge_port *p, + __be16 proto) +{ + struct bridge_mcast_stats __percpu *stats; + struct bridge_mcast_stats *pstats; + + if (!br->multicast_stats_enabled) + return; + + if (p) + stats = p->mcast_stats; + else + stats = br->mcast_stats; + if (WARN_ON(!stats)) + return; + + pstats = this_cpu_ptr(stats); + + u64_stats_update_begin(&pstats->syncp); + switch (proto) { + case htons(ETH_P_IP): + pstats->mstats.igmp_parse_errors++; + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + pstats->mstats.mld_parse_errors++; + break; +#endif + } + u64_stats_update_end(&pstats->syncp); +} + static int br_multicast_ipv4_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, @@ -1599,11 +1649,12 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; } else if (err < 0) { + br_multicast_err_count(br, port, skb->protocol); return err; } - BR_INPUT_SKB_CB(skb)->igmp = 1; ih = igmp_hdr(skb); + BR_INPUT_SKB_CB(skb)->igmp = ih->type; switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: @@ -1625,6 +1676,9 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, if (skb_trimmed && skb_trimmed != skb) kfree_skb(skb_trimmed); + br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, + BR_MCAST_DIR_RX); + return err; } @@ -1645,11 +1699,12 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; } else if (err < 0) { + br_multicast_err_count(br, port, skb->protocol); return err; } - BR_INPUT_SKB_CB(skb)->igmp = 1; mld = (struct mld_msg *)skb_transport_header(skb); + BR_INPUT_SKB_CB(skb)->igmp = mld->mld_type; switch (mld->mld_type) { case ICMPV6_MGM_REPORT: @@ -1670,6 +1725,9 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, if (skb_trimmed && skb_trimmed != skb) kfree_skb(skb_trimmed); + br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, + BR_MCAST_DIR_RX); + return err; } #endif @@ -1677,6 +1735,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid) { + int ret = 0; + BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; @@ -1685,14 +1745,16 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, switch (skb->protocol) { case htons(ETH_P_IP): - return br_multicast_ipv4_rcv(br, port, skb, vid); + ret = br_multicast_ipv4_rcv(br, port, skb, vid); + break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return br_multicast_ipv6_rcv(br, port, skb, vid); + ret = br_multicast_ipv6_rcv(br, port, skb, vid); + break; #endif } - return 0; + return ret; } static void br_multicast_query_expired(struct net_bridge *br, @@ -1831,6 +1893,8 @@ void br_multicast_dev_del(struct net_bridge *br) out: spin_unlock_bh(&br->multicast_lock); + + free_percpu(br->mcast_stats); } int br_multicast_set_router(struct net_bridge *br, unsigned long val) @@ -2185,3 +2249,154 @@ unlock: return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); + +static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, + const struct sk_buff *skb, u8 type, u8 dir) +{ + struct bridge_mcast_stats *pstats = this_cpu_ptr(stats); + __be16 proto = skb->protocol; + unsigned int t_len; + + u64_stats_update_begin(&pstats->syncp); + switch (proto) { + case htons(ETH_P_IP): + t_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + switch (type) { + case IGMP_HOST_MEMBERSHIP_REPORT: + pstats->mstats.igmp_v1reports[dir]++; + break; + case IGMPV2_HOST_MEMBERSHIP_REPORT: + pstats->mstats.igmp_v2reports[dir]++; + break; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + pstats->mstats.igmp_v3reports[dir]++; + break; + case IGMP_HOST_MEMBERSHIP_QUERY: + if (t_len != sizeof(struct igmphdr)) { + pstats->mstats.igmp_v3queries[dir]++; + } else { + unsigned int offset = skb_transport_offset(skb); + struct igmphdr *ih, _ihdr; + + ih = skb_header_pointer(skb, offset, + sizeof(_ihdr), &_ihdr); + if (!ih) + break; + if (!ih->code) + pstats->mstats.igmp_v1queries[dir]++; + else + pstats->mstats.igmp_v2queries[dir]++; + } + break; + case IGMP_HOST_LEAVE_MESSAGE: + pstats->mstats.igmp_leaves[dir]++; + break; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + t_len = ntohs(ipv6_hdr(skb)->payload_len) + + sizeof(struct ipv6hdr); + t_len -= skb_network_header_len(skb); + switch (type) { + case ICMPV6_MGM_REPORT: + pstats->mstats.mld_v1reports[dir]++; + break; + case ICMPV6_MLD2_REPORT: + pstats->mstats.mld_v2reports[dir]++; + break; + case ICMPV6_MGM_QUERY: + if (t_len != sizeof(struct mld_msg)) + pstats->mstats.mld_v2queries[dir]++; + else + pstats->mstats.mld_v1queries[dir]++; + break; + case ICMPV6_MGM_REDUCTION: + pstats->mstats.mld_leaves[dir]++; + break; + } + break; +#endif /* CONFIG_IPV6 */ + } + u64_stats_update_end(&pstats->syncp); +} + +void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, + const struct sk_buff *skb, u8 type, u8 dir) +{ + struct bridge_mcast_stats __percpu *stats; + + /* if multicast_disabled is true then igmp type can't be set */ + if (!type || !br->multicast_stats_enabled) + return; + + if (p) + stats = p->mcast_stats; + else + stats = br->mcast_stats; + if (WARN_ON(!stats)) + return; + + br_mcast_stats_add(stats, skb, type, dir); +} + +int br_multicast_init_stats(struct net_bridge *br) +{ + br->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); + if (!br->mcast_stats) + return -ENOMEM; + + return 0; +} + +static void mcast_stats_add_dir(u64 *dst, u64 *src) +{ + dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; + dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX]; +} + +void br_multicast_get_stats(const struct net_bridge *br, + const struct net_bridge_port *p, + struct br_mcast_stats *dest) +{ + struct bridge_mcast_stats __percpu *stats; + struct br_mcast_stats tdst; + int i; + + memset(dest, 0, sizeof(*dest)); + if (p) + stats = p->mcast_stats; + else + stats = br->mcast_stats; + if (WARN_ON(!stats)) + return; + + memset(&tdst, 0, sizeof(tdst)); + for_each_possible_cpu(i) { + struct bridge_mcast_stats *cpu_stats = per_cpu_ptr(stats, i); + struct br_mcast_stats temp; + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); + mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); + mcast_stats_add_dir(tdst.igmp_v3queries, temp.igmp_v3queries); + mcast_stats_add_dir(tdst.igmp_leaves, temp.igmp_leaves); + mcast_stats_add_dir(tdst.igmp_v1reports, temp.igmp_v1reports); + mcast_stats_add_dir(tdst.igmp_v2reports, temp.igmp_v2reports); + mcast_stats_add_dir(tdst.igmp_v3reports, temp.igmp_v3reports); + tdst.igmp_parse_errors += temp.igmp_parse_errors; + + mcast_stats_add_dir(tdst.mld_v1queries, temp.mld_v1queries); + mcast_stats_add_dir(tdst.mld_v2queries, temp.mld_v2queries); + mcast_stats_add_dir(tdst.mld_leaves, temp.mld_leaves); + mcast_stats_add_dir(tdst.mld_v1reports, temp.mld_v1reports); + mcast_stats_add_dir(tdst.mld_v2reports, temp.mld_v2reports); + tdst.mld_parse_errors += temp.mld_parse_errors; + } + memcpy(dest, &tdst, sizeof(*dest)); +} diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 85e89f693..f2a29e467 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -851,6 +851,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1055,6 +1056,13 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br->multicast_startup_query_interval = clock_t_to_jiffies(val); } + + if (data[IFLA_BR_MCAST_STATS_ENABLED]) { + __u8 mcast_stats; + + mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); + br->multicast_stats_enabled = !!mcast_stats; + } #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (data[IFLA_BR_NF_CALL_IPTABLES]) { @@ -1110,6 +1118,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_STATS_ENABLED */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ @@ -1187,6 +1196,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, br->multicast_query_use_ifaddr) || nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) || + nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, + br->multicast_stats_enabled) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, br->hash_elasticity) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || @@ -1234,7 +1245,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return 0; } -static size_t br_get_linkxstats_size(const struct net_device *dev) +static size_t bridge_get_linkxstats_size(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_vlan_group *vg; @@ -1242,53 +1253,88 @@ static size_t br_get_linkxstats_size(const struct net_device *dev) int numvls = 0; vg = br_vlan_group(br); - if (!vg) - return 0; - - /* we need to count all, even placeholder entries */ - list_for_each_entry(v, &vg->vlan_list, vlist) - numvls++; + if (vg) { + /* we need to count all, even placeholder entries */ + list_for_each_entry(v, &vg->vlan_list, vlist) + numvls++; + } - /* account for the vlans and the link xstats type nest attribute */ return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + + nla_total_size(sizeof(struct br_mcast_stats)) + nla_total_size(0); } -static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, - int *prividx) +static size_t brport_get_linkxstats_size(const struct net_device *dev) +{ + return nla_total_size(sizeof(struct br_mcast_stats)) + + nla_total_size(0); +} + +static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) +{ + size_t retsize = 0; + + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + retsize = bridge_get_linkxstats_size(dev); + break; + case IFLA_STATS_LINK_XSTATS_SLAVE: + retsize = brport_get_linkxstats_size(dev); + break; + } + + return retsize; +} + +static int bridge_fill_linkxstats(struct sk_buff *skb, + const struct net_device *dev, + int *prividx) { struct net_bridge *br = netdev_priv(dev); + struct nlattr *nla __maybe_unused; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct nlattr *nest; int vl_idx = 0; - vg = br_vlan_group(br); - if (!vg) - goto out; nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); if (!nest) return -EMSGSIZE; - list_for_each_entry(v, &vg->vlan_list, vlist) { - struct bridge_vlan_xstats vxi; - struct br_vlan_stats stats; - if (++vl_idx < *prividx) - continue; - memset(&vxi, 0, sizeof(vxi)); - vxi.vid = v->vid; - br_vlan_get_stats(v, &stats); - vxi.rx_bytes = stats.rx_bytes; - vxi.rx_packets = stats.rx_packets; - vxi.tx_bytes = stats.tx_bytes; - vxi.tx_packets = stats.tx_packets; - - if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi)) + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(v, &vg->vlan_list, vlist) { + struct bridge_vlan_xstats vxi; + struct br_vlan_stats stats; + + if (++vl_idx < *prividx) + continue; + memset(&vxi, 0, sizeof(vxi)); + vxi.vid = v->vid; + br_vlan_get_stats(v, &stats); + vxi.rx_bytes = stats.rx_bytes; + vxi.rx_packets = stats.rx_packets; + vxi.tx_bytes = stats.tx_bytes; + vxi.tx_packets = stats.tx_packets; + + if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi)) + goto nla_put_failure; + } + } + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (++vl_idx >= *prividx) { + nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST, + sizeof(struct br_mcast_stats), + BRIDGE_XSTATS_PAD); + if (!nla) goto nla_put_failure; + br_multicast_get_stats(br, NULL, nla_data(nla)); } +#endif nla_nest_end(skb, nest); *prividx = 0; -out: + return 0; nla_put_failure: @@ -1298,6 +1344,52 @@ nla_put_failure: return -EMSGSIZE; } +static int brport_fill_linkxstats(struct sk_buff *skb, + const struct net_device *dev, + int *prividx) +{ + struct net_bridge_port *p = br_port_get_rtnl(dev); + struct nlattr *nla __maybe_unused; + struct nlattr *nest; + + if (!p) + return 0; + + nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); + if (!nest) + return -EMSGSIZE; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST, + sizeof(struct br_mcast_stats), + BRIDGE_XSTATS_PAD); + if (!nla) { + nla_nest_end(skb, nest); + return -EMSGSIZE; + } + br_multicast_get_stats(p->br, p, nla_data(nla)); +#endif + nla_nest_end(skb, nest); + + return 0; +} + +static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, + int *prividx, int attr) +{ + int ret = -EINVAL; + + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + ret = bridge_fill_linkxstats(skb, dev, prividx); + break; + case IFLA_STATS_LINK_XSTATS_SLAVE: + ret = brport_fill_linkxstats(skb, dev, prividx); + break; + } + + return ret; +} + static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, .get_link_af_size = br_get_link_af_size_filtered, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 52edecf3c..aac2a6e6b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -75,6 +75,12 @@ struct bridge_mcast_querier { struct br_ip addr; struct net_bridge_port __rcu *port; }; + +/* IGMP/MLD statistics */ +struct bridge_mcast_stats { + struct br_mcast_stats mstats; + struct u64_stats_sync syncp; +}; #endif struct br_vlan_stats { @@ -229,6 +235,7 @@ struct net_bridge_port struct bridge_mcast_own_query ip6_own_query; #endif /* IS_ENABLED(CONFIG_IPV6) */ unsigned char multicast_router; + struct bridge_mcast_stats __percpu *mcast_stats; struct timer_list multicast_router_timer; struct hlist_head mglist; struct hlist_node rlist; @@ -315,6 +322,7 @@ struct net_bridge u8 multicast_querier:1; u8 multicast_query_use_ifaddr:1; u8 has_ipv6_addr:1; + u8 multicast_stats_enabled:1; u32 hash_elasticity; u32 hash_max; @@ -337,6 +345,7 @@ struct net_bridge struct bridge_mcast_other_query ip4_other_query; struct bridge_mcast_own_query ip4_own_query; struct bridge_mcast_querier ip4_querier; + struct bridge_mcast_stats __percpu *mcast_stats; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_other_query ip6_other_query; struct bridge_mcast_own_query ip6_own_query; @@ -496,14 +505,12 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid); /* br_forward.c */ -void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb); int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb); -void br_forward(const struct net_bridge_port *to, - struct sk_buff *skb, struct sk_buff *skb0); +void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, + bool local_rcv, bool local_orig); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); -void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast); -void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb2, bool unicast); +void br_flood(struct net_bridge *br, struct sk_buff *skb, + bool unicast, bool local_rcv, bool local_orig); /* br_if.c */ void br_port_carrier_check(struct net_bridge_port *p); @@ -543,7 +550,7 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid); struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, struct sk_buff *skb, u16 vid); -void br_multicast_add_port(struct net_bridge_port *port); +int br_multicast_add_port(struct net_bridge_port *port); void br_multicast_del_port(struct net_bridge_port *port); void br_multicast_enable_port(struct net_bridge_port *port); void br_multicast_disable_port(struct net_bridge_port *port); @@ -551,10 +558,8 @@ void br_multicast_init(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); void br_multicast_dev_del(struct net_bridge *br); -void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb); -void br_multicast_forward(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, struct sk_buff *skb2); +void br_multicast_flood(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, bool local_rcv, bool local_orig); int br_multicast_set_router(struct net_bridge *br, unsigned long val); int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); int br_multicast_toggle(struct net_bridge *br, unsigned long val); @@ -576,6 +581,12 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_ip *group, int type, u8 flags); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, int type); +void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, + const struct sk_buff *skb, u8 type, u8 dir); +int br_multicast_init_stats(struct net_bridge *br); +void br_multicast_get_stats(const struct net_bridge *br, + const struct net_bridge_port *p, + struct br_mcast_stats *dest); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -623,6 +634,11 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, return false; } } + +static inline int br_multicast_igmp_type(const struct sk_buff *skb) +{ + return BR_INPUT_SKB_CB(skb)->igmp; +} #else static inline int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, @@ -638,8 +654,9 @@ static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return NULL; } -static inline void br_multicast_add_port(struct net_bridge_port *port) +static inline int br_multicast_add_port(struct net_bridge_port *port) { + return 0; } static inline void br_multicast_del_port(struct net_bridge_port *port) @@ -670,31 +687,47 @@ static inline void br_multicast_dev_del(struct net_bridge *br) { } -static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb) +static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, + bool local_rcv, bool local_orig) { } -static inline void br_multicast_forward(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, - struct sk_buff *skb2) -{ -} static inline bool br_multicast_is_router(struct net_bridge *br) { return 0; } + static inline bool br_multicast_querier_exists(struct net_bridge *br, struct ethhdr *eth) { return false; } + static inline void br_mdb_init(void) { } + static inline void br_mdb_uninit(void) { } + +static inline void br_multicast_count(struct net_bridge *br, + const struct net_bridge_port *p, + const struct sk_buff *skb, + u8 type, u8 dir) +{ +} + +static inline int br_multicast_init_stats(struct net_bridge *br) +{ + return 0; +} + +static inline int br_multicast_igmp_type(const struct sk_buff *skb) +{ + return 0; +} #endif /* br_vlan.c */ @@ -942,7 +975,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t); int br_set_forward_delay(struct net_bridge *br, unsigned long x); int br_set_hello_time(struct net_bridge *br, unsigned long x); int br_set_max_age(struct net_bridge *br, unsigned long x); -int br_set_ageing_time(struct net_bridge *br, u32 ageing_time); +int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time); /* br_stp_if.c */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 9cb7044d0..9258b8ef1 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -570,7 +570,7 @@ int br_set_max_age(struct net_bridge *br, unsigned long val) * * Offloaded switch entries maybe more restrictive */ -int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) +int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) { struct switchdev_attr attr = { .orig_dev = br->dev, diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 984d46263..341caa0ca 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -55,7 +55,7 @@ void br_init_port(struct net_bridge_port *p) netdev_err(p->dev, "failed to set HW ageing time\n"); } -/* called under bridge lock */ +/* NO locks held */ void br_stp_enable_bridge(struct net_bridge *br) { struct net_bridge_port *p; diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index beb47071e..e120307c6 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -618,6 +618,30 @@ static ssize_t multicast_startup_query_interval_store( return store_bridge_parm(d, buf, len, set_startup_query_interval); } static DEVICE_ATTR_RW(multicast_startup_query_interval); + +static ssize_t multicast_stats_enabled_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + + return sprintf(buf, "%u\n", br->multicast_stats_enabled); +} + +static int set_stats_enabled(struct net_bridge *br, unsigned long val) +{ + br->multicast_stats_enabled = !!val; + return 0; +} + +static ssize_t multicast_stats_enabled_store(struct device *d, + struct device_attribute *attr, + const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_stats_enabled); +} +static DEVICE_ATTR_RW(multicast_stats_enabled); #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( @@ -784,6 +808,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_query_interval.attr, &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, + &dev_attr_multicast_stats_enabled.attr, #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c index 2a449b7ab..5fc4affd9 100644 --- a/net/bridge/netfilter/ebt_802_3.c +++ b/net/bridge/netfilter/ebt_802_3.c @@ -20,16 +20,16 @@ ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par) __be16 type = hdr->llc.ui.ctrl & IS_UI ? hdr->llc.ui.type : hdr->llc.ni.type; if (info->bitmask & EBT_802_3_SAP) { - if (FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP)) + if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.ssap)) return false; - if (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP)) + if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.dsap)) return false; } if (info->bitmask & EBT_802_3_TYPE) { if (!(hdr->llc.ui.dsap == CHECK_TYPE && hdr->llc.ui.ssap == CHECK_TYPE)) return false; - if (FWINV(info->type != type, EBT_802_3_TYPE)) + if (NF_INVF(info, EBT_802_3_TYPE, info->type != type)) return false; } diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c index cd457b891..227142282 100644 --- a/net/bridge/netfilter/ebt_arp.c +++ b/net/bridge/netfilter/ebt_arp.c @@ -25,14 +25,14 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL) return false; - if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode != - ah->ar_op, EBT_ARP_OPCODE)) + if ((info->bitmask & EBT_ARP_OPCODE) && + NF_INVF(info, EBT_ARP_OPCODE, info->opcode != ah->ar_op)) return false; - if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype != - ah->ar_hrd, EBT_ARP_HTYPE)) + if ((info->bitmask & EBT_ARP_HTYPE) && + NF_INVF(info, EBT_ARP_HTYPE, info->htype != ah->ar_hrd)) return false; - if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype != - ah->ar_pro, EBT_ARP_PTYPE)) + if ((info->bitmask & EBT_ARP_PTYPE) && + NF_INVF(info, EBT_ARP_PTYPE, info->ptype != ah->ar_pro)) return false; if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP | EBT_ARP_GRAT)) { @@ -51,21 +51,22 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) sizeof(daddr), &daddr); if (dap == NULL) return false; - if (info->bitmask & EBT_ARP_SRC_IP && - FWINV(info->saddr != (*sap & info->smsk), EBT_ARP_SRC_IP)) + if ((info->bitmask & EBT_ARP_SRC_IP) && + NF_INVF(info, EBT_ARP_SRC_IP, + info->saddr != (*sap & info->smsk))) return false; - if (info->bitmask & EBT_ARP_DST_IP && - FWINV(info->daddr != (*dap & info->dmsk), EBT_ARP_DST_IP)) + if ((info->bitmask & EBT_ARP_DST_IP) && + NF_INVF(info, EBT_ARP_DST_IP, + info->daddr != (*dap & info->dmsk))) return false; - if (info->bitmask & EBT_ARP_GRAT && - FWINV(*dap != *sap, EBT_ARP_GRAT)) + if ((info->bitmask & EBT_ARP_GRAT) && + NF_INVF(info, EBT_ARP_GRAT, *dap != *sap)) return false; } if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) { const unsigned char *mp; unsigned char _mac[ETH_ALEN]; - uint8_t verdict, i; if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER)) return false; @@ -74,11 +75,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) sizeof(_mac), &_mac); if (mp == NULL) return false; - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (mp[i] ^ info->smaddr[i]) & - info->smmsk[i]; - if (FWINV(verdict != 0, EBT_ARP_SRC_MAC)) + if (NF_INVF(info, EBT_ARP_SRC_MAC, + !ether_addr_equal_masked(mp, info->smaddr, + info->smmsk))) return false; } @@ -88,11 +87,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) sizeof(_mac), &_mac); if (mp == NULL) return false; - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (mp[i] ^ info->dmaddr[i]) & - info->dmmsk[i]; - if (FWINV(verdict != 0, EBT_ARP_DST_MAC)) + if (NF_INVF(info, EBT_ARP_DST_MAC, + !ether_addr_equal_masked(mp, info->dmaddr, + info->dmmsk))) return false; } } diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index 23bca62d5..d06968bdf 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -36,19 +36,19 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) return false; - if (info->bitmask & EBT_IP_TOS && - FWINV(info->tos != ih->tos, EBT_IP_TOS)) + if ((info->bitmask & EBT_IP_TOS) && + NF_INVF(info, EBT_IP_TOS, info->tos != ih->tos)) return false; - if (info->bitmask & EBT_IP_SOURCE && - FWINV((ih->saddr & info->smsk) != - info->saddr, EBT_IP_SOURCE)) + if ((info->bitmask & EBT_IP_SOURCE) && + NF_INVF(info, EBT_IP_SOURCE, + (ih->saddr & info->smsk) != info->saddr)) return false; if ((info->bitmask & EBT_IP_DEST) && - FWINV((ih->daddr & info->dmsk) != - info->daddr, EBT_IP_DEST)) + NF_INVF(info, EBT_IP_DEST, + (ih->daddr & info->dmsk) != info->daddr)) return false; if (info->bitmask & EBT_IP_PROTO) { - if (FWINV(info->protocol != ih->protocol, EBT_IP_PROTO)) + if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol)) return false; if (!(info->bitmask & EBT_IP_DPORT) && !(info->bitmask & EBT_IP_SPORT)) @@ -61,16 +61,16 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (info->bitmask & EBT_IP_DPORT) { u32 dst = ntohs(pptr->dst); - if (FWINV(dst < info->dport[0] || - dst > info->dport[1], - EBT_IP_DPORT)) + if (NF_INVF(info, EBT_IP_DPORT, + dst < info->dport[0] || + dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP_SPORT) { u32 src = ntohs(pptr->src); - if (FWINV(src < info->sport[0] || - src > info->sport[1], - EBT_IP_SPORT)) + if (NF_INVF(info, EBT_IP_SPORT, + src < info->sport[0] || + src > info->sport[1])) return false; } } diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index 98de6e7fd..4617491be 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -45,15 +45,18 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h); if (ih6 == NULL) return false; - if (info->bitmask & EBT_IP6_TCLASS && - FWINV(info->tclass != ipv6_get_dsfield(ih6), EBT_IP6_TCLASS)) + if ((info->bitmask & EBT_IP6_TCLASS) && + NF_INVF(info, EBT_IP6_TCLASS, + info->tclass != ipv6_get_dsfield(ih6))) return false; - if ((info->bitmask & EBT_IP6_SOURCE && - FWINV(ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk, - &info->saddr), EBT_IP6_SOURCE)) || - (info->bitmask & EBT_IP6_DEST && - FWINV(ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk, - &info->daddr), EBT_IP6_DEST))) + if (((info->bitmask & EBT_IP6_SOURCE) && + NF_INVF(info, EBT_IP6_SOURCE, + ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk, + &info->saddr))) || + ((info->bitmask & EBT_IP6_DEST) && + NF_INVF(info, EBT_IP6_DEST, + ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk, + &info->daddr)))) return false; if (info->bitmask & EBT_IP6_PROTO) { uint8_t nexthdr = ih6->nexthdr; @@ -63,7 +66,7 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr, &frag_off); if (offset_ph == -1) return false; - if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO)) + if (NF_INVF(info, EBT_IP6_PROTO, info->protocol != nexthdr)) return false; if (!(info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT | EBT_IP6_ICMP6))) @@ -76,22 +79,24 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (info->bitmask & EBT_IP6_DPORT) { u16 dst = ntohs(pptr->tcpudphdr.dst); - if (FWINV(dst < info->dport[0] || - dst > info->dport[1], EBT_IP6_DPORT)) + if (NF_INVF(info, EBT_IP6_DPORT, + dst < info->dport[0] || + dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP6_SPORT) { u16 src = ntohs(pptr->tcpudphdr.src); - if (FWINV(src < info->sport[0] || - src > info->sport[1], EBT_IP6_SPORT)) + if (NF_INVF(info, EBT_IP6_SPORT, + src < info->sport[0] || + src > info->sport[1])) return false; } if ((info->bitmask & EBT_IP6_ICMP6) && - FWINV(pptr->icmphdr.type < info->icmpv6_type[0] || - pptr->icmphdr.type > info->icmpv6_type[1] || - pptr->icmphdr.code < info->icmpv6_code[0] || - pptr->icmphdr.code > info->icmpv6_code[1], - EBT_IP6_ICMP6)) + NF_INVF(info, EBT_IP6_ICMP6, + pptr->icmphdr.type < info->icmpv6_type[0] || + pptr->icmphdr.type > info->icmpv6_type[1] || + pptr->icmphdr.code < info->icmpv6_code[0] || + pptr->icmphdr.code > info->icmpv6_code[1])) return false; } return true; diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 6b731e12e..3140eb912 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -17,24 +17,24 @@ #define BPDU_TYPE_TCN 0x80 struct stp_header { - uint8_t dsap; - uint8_t ssap; - uint8_t ctrl; - uint8_t pid; - uint8_t vers; - uint8_t type; + u8 dsap; + u8 ssap; + u8 ctrl; + u8 pid; + u8 vers; + u8 type; }; struct stp_config_pdu { - uint8_t flags; - uint8_t root[8]; - uint8_t root_cost[4]; - uint8_t sender[8]; - uint8_t port[2]; - uint8_t msg_age[2]; - uint8_t max_age[2]; - uint8_t hello_time[2]; - uint8_t forward_delay[2]; + u8 flags; + u8 root[8]; + u8 root_cost[4]; + u8 sender[8]; + u8 port[2]; + u8 msg_age[2]; + u8 max_age[2]; + u8 hello_time[2]; + u8 forward_delay[2]; }; #define NR16(p) (p[0] << 8 | p[1]) @@ -44,76 +44,73 @@ static bool ebt_filter_config(const struct ebt_stp_info *info, const struct stp_config_pdu *stpc) { const struct ebt_stp_config_info *c; - uint16_t v16; - uint32_t v32; - int verdict, i; + u16 v16; + u32 v32; c = &info->config; if ((info->bitmask & EBT_STP_FLAGS) && - FWINV(c->flags != stpc->flags, EBT_STP_FLAGS)) + NF_INVF(info, EBT_STP_FLAGS, c->flags != stpc->flags)) return false; if (info->bitmask & EBT_STP_ROOTPRIO) { v16 = NR16(stpc->root); - if (FWINV(v16 < c->root_priol || - v16 > c->root_priou, EBT_STP_ROOTPRIO)) + if (NF_INVF(info, EBT_STP_ROOTPRIO, + v16 < c->root_priol || v16 > c->root_priou)) return false; } if (info->bitmask & EBT_STP_ROOTADDR) { - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (stpc->root[2+i] ^ c->root_addr[i]) & - c->root_addrmsk[i]; - if (FWINV(verdict != 0, EBT_STP_ROOTADDR)) + if (NF_INVF(info, EBT_STP_ROOTADDR, + !ether_addr_equal_masked(&stpc->root[2], + c->root_addr, + c->root_addrmsk))) return false; } if (info->bitmask & EBT_STP_ROOTCOST) { v32 = NR32(stpc->root_cost); - if (FWINV(v32 < c->root_costl || - v32 > c->root_costu, EBT_STP_ROOTCOST)) + if (NF_INVF(info, EBT_STP_ROOTCOST, + v32 < c->root_costl || v32 > c->root_costu)) return false; } if (info->bitmask & EBT_STP_SENDERPRIO) { v16 = NR16(stpc->sender); - if (FWINV(v16 < c->sender_priol || - v16 > c->sender_priou, EBT_STP_SENDERPRIO)) + if (NF_INVF(info, EBT_STP_SENDERPRIO, + v16 < c->sender_priol || v16 > c->sender_priou)) return false; } if (info->bitmask & EBT_STP_SENDERADDR) { - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (stpc->sender[2+i] ^ c->sender_addr[i]) & - c->sender_addrmsk[i]; - if (FWINV(verdict != 0, EBT_STP_SENDERADDR)) + if (NF_INVF(info, EBT_STP_SENDERADDR, + !ether_addr_equal_masked(&stpc->sender[2], + c->sender_addr, + c->sender_addrmsk))) return false; } if (info->bitmask & EBT_STP_PORT) { v16 = NR16(stpc->port); - if (FWINV(v16 < c->portl || - v16 > c->portu, EBT_STP_PORT)) + if (NF_INVF(info, EBT_STP_PORT, + v16 < c->portl || v16 > c->portu)) return false; } if (info->bitmask & EBT_STP_MSGAGE) { v16 = NR16(stpc->msg_age); - if (FWINV(v16 < c->msg_agel || - v16 > c->msg_ageu, EBT_STP_MSGAGE)) + if (NF_INVF(info, EBT_STP_MSGAGE, + v16 < c->msg_agel || v16 > c->msg_ageu)) return false; } if (info->bitmask & EBT_STP_MAXAGE) { v16 = NR16(stpc->max_age); - if (FWINV(v16 < c->max_agel || - v16 > c->max_ageu, EBT_STP_MAXAGE)) + if (NF_INVF(info, EBT_STP_MAXAGE, + v16 < c->max_agel || v16 > c->max_ageu)) return false; } if (info->bitmask & EBT_STP_HELLOTIME) { v16 = NR16(stpc->hello_time); - if (FWINV(v16 < c->hello_timel || - v16 > c->hello_timeu, EBT_STP_HELLOTIME)) + if (NF_INVF(info, EBT_STP_HELLOTIME, + v16 < c->hello_timel || v16 > c->hello_timeu)) return false; } if (info->bitmask & EBT_STP_FWDD) { v16 = NR16(stpc->forward_delay); - if (FWINV(v16 < c->forward_delayl || - v16 > c->forward_delayu, EBT_STP_FWDD)) + if (NF_INVF(info, EBT_STP_FWDD, + v16 < c->forward_delayl || v16 > c->forward_delayu)) return false; } return true; @@ -125,7 +122,7 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct ebt_stp_info *info = par->matchinfo; const struct stp_header *sp; struct stp_header _stph; - const uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00}; + const u8 header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00}; sp = skb_header_pointer(skb, 0, sizeof(_stph), &_stph); if (sp == NULL) @@ -135,8 +132,8 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par) if (memcmp(sp, header, sizeof(header))) return false; - if (info->bitmask & EBT_STP_TYPE && - FWINV(info->type != sp->type, EBT_STP_TYPE)) + if ((info->bitmask & EBT_STP_TYPE) && + NF_INVF(info, EBT_STP_TYPE, info->type != sp->type)) return false; if (sp->type == BPDU_TYPE_CONFIG && @@ -156,8 +153,8 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par) static int ebt_stp_mt_check(const struct xt_mtchk_param *par) { const struct ebt_stp_info *info = par->matchinfo; - const uint8_t bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00}; - const uint8_t msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const u8 bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00}; + const u8 msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; const struct ebt_entry *e = par->entryinfo; if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK || diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5a61f3541..0833c251a 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -121,7 +121,6 @@ ebt_dev_check(const char *entry, const struct net_device *device) return devname[i] != entry[i] && entry[i] != 1; } -#define FWINV2(bool, invflg) ((bool) ^ !!(e->invflags & invflg)) /* process standard matches */ static inline int ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, @@ -130,7 +129,6 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, const struct ethhdr *h = eth_hdr(skb); const struct net_bridge_port *p; __be16 ethproto; - int verdict, i; if (skb_vlan_tag_present(skb)) ethproto = htons(ETH_P_8021Q); @@ -138,38 +136,36 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, ethproto = h->h_proto; if (e->bitmask & EBT_802_3) { - if (FWINV2(eth_proto_is_802_3(ethproto), EBT_IPROTO)) + if (NF_INVF(e, EBT_IPROTO, eth_proto_is_802_3(ethproto))) return 1; } else if (!(e->bitmask & EBT_NOPROTO) && - FWINV2(e->ethproto != ethproto, EBT_IPROTO)) + NF_INVF(e, EBT_IPROTO, e->ethproto != ethproto)) return 1; - if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN)) + if (NF_INVF(e, EBT_IIN, ebt_dev_check(e->in, in))) return 1; - if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT)) + if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out))) return 1; /* rcu_read_lock()ed by nf_hook_slow */ if (in && (p = br_port_get_rcu(in)) != NULL && - FWINV2(ebt_dev_check(e->logical_in, p->br->dev), EBT_ILOGICALIN)) + NF_INVF(e, EBT_ILOGICALIN, + ebt_dev_check(e->logical_in, p->br->dev))) return 1; if (out && (p = br_port_get_rcu(out)) != NULL && - FWINV2(ebt_dev_check(e->logical_out, p->br->dev), EBT_ILOGICALOUT)) + NF_INVF(e, EBT_ILOGICALOUT, + ebt_dev_check(e->logical_out, p->br->dev))) return 1; if (e->bitmask & EBT_SOURCEMAC) { - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (h->h_source[i] ^ e->sourcemac[i]) & - e->sourcemsk[i]; - if (FWINV2(verdict != 0, EBT_ISOURCE)) + if (NF_INVF(e, EBT_ISOURCE, + !ether_addr_equal_masked(h->h_source, e->sourcemac, + e->sourcemsk))) return 1; } if (e->bitmask & EBT_DESTMAC) { - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (h->h_dest[i] ^ e->destmac[i]) & - e->destmsk[i]; - if (FWINV2(verdict != 0, EBT_IDEST)) + if (NF_INVF(e, EBT_IDEST, + !ether_addr_equal_masked(h->h_dest, e->destmac, + e->destmsk))) return 1; } return 0; @@ -372,6 +368,8 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) { + if (!IS_ERR(match)) + module_put(match->me); request_module("ebt_%s", m->u.name); match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); } diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 4b901d9f2..ad47a921b 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -86,6 +86,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 77f7e7a9e..0b77ffbc2 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -72,7 +72,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(dev), nskb); + br_forward(br_port_get_rcu(dev), nskb, false, true); } static void nft_reject_br_send_v4_unreach(struct net *net, @@ -140,7 +140,7 @@ static void nft_reject_br_send_v4_unreach(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(dev), nskb); + br_forward(br_port_get_rcu(dev), nskb, false, true); } static void nft_reject_br_send_v6_tcp_reset(struct net *net, @@ -174,7 +174,7 @@ static void nft_reject_br_send_v6_tcp_reset(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(dev), nskb); + br_forward(br_port_get_rcu(dev), nskb, false, true); } static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) @@ -255,7 +255,7 @@ static void nft_reject_br_send_v6_unreach(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(dev), nskb); + br_forward(br_port_get_rcu(dev), nskb, false, true); } static void nft_reject_bridge_eval(const struct nft_expr *expr, diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 67a4a36fe..3408ed51b 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/net/can/Makefile b/net/can/Makefile index cef49eb1f..10936754e 100644 --- a/net/can/Makefile +++ b/net/can/Makefile @@ -3,7 +3,8 @@ # obj-$(CONFIG_CAN) += can.o -can-y := af_can.o proc.o +can-y := af_can.o +can-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_CAN_RAW) += can-raw.o can-raw-y := raw.o diff --git a/net/can/af_can.c b/net/can/af_can.c index 166d43619..1108079d9 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -911,14 +911,14 @@ static __init int can_init(void) if (!rcv_cache) return -ENOMEM; - if (stats_timer) { + if (IS_ENABLED(CONFIG_PROC_FS)) { + if (stats_timer) { /* the statistics are updated every second (timer triggered) */ - setup_timer(&can_stattimer, can_stat_update, 0); - mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); - } else - can_stattimer.function = NULL; - - can_init_proc(); + setup_timer(&can_stattimer, can_stat_update, 0); + mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); + } + can_init_proc(); + } /* protocol register */ sock_register(&can_family_ops); @@ -933,10 +933,12 @@ static __exit void can_exit(void) { struct net_device *dev; - if (stats_timer) - del_timer_sync(&can_stattimer); + if (IS_ENABLED(CONFIG_PROC_FS)) { + if (stats_timer) + del_timer_sync(&can_stattimer); - can_remove_proc(); + can_remove_proc(); + } /* protocol unregister */ dev_remove_pack(&canfd_packet); diff --git a/net/can/bcm.c b/net/can/bcm.c index 6863310d6..8e999ffdf 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1,7 +1,7 @@ /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * - * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * Copyright (c) 2002-2016 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -67,27 +67,31 @@ */ #define MAX_NFRAMES 256 -/* use of last_frames[index].can_dlc */ +/* use of last_frames[index].flags */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ -#define BCM_CAN_DLC_MASK 0x0F /* clean private flags in can_dlc by masking */ +#define BCM_CAN_FLAGS_MASK 0x3F /* to clean private flags after usage */ /* get best masking value for can_rx_register() for a given single can_id */ #define REGMASK(id) ((id & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) -#define CAN_BCM_VERSION CAN_VERSION +#define CAN_BCM_VERSION "20160617" MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp "); MODULE_ALIAS("can-proto-2"); -/* easy access to can_frame payload */ -static inline u64 GET_U64(const struct can_frame *cp) +/* + * easy access to the first 64 bit of can(fd)_frame payload. cp->data is + * 64 bit aligned so the offset has to be multiples of 8 which is ensured + * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler(). + */ +static inline u64 get_u64(const struct canfd_frame *cp, int offset) { - return *(u64 *)cp->data; + return *(u64 *)(cp->data + offset); } struct bcm_op { @@ -101,13 +105,14 @@ struct bcm_op { struct tasklet_struct tsklet, thrtsklet; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; + int cfsiz; u32 count; u32 nframes; u32 currframe; - struct can_frame *frames; - struct can_frame *last_frames; - struct can_frame sframe; - struct can_frame last_sframe; + struct canfd_frame *frames; + struct canfd_frame *last_frames; + struct canfd_frame sframe; + struct canfd_frame last_sframe; struct sock *sk; struct net_device *rx_reg_dev; }; @@ -136,7 +141,7 @@ static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); } -#define CFSIZ sizeof(struct can_frame) +#define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) @@ -183,43 +188,50 @@ static int bcm_proc_show(struct seq_file *m, void *v) if (!op->frames_abs) continue; - seq_printf(m, "rx_op: %03X %-5s ", - op->can_id, bcm_proc_getifname(ifname, op->ifindex)); - seq_printf(m, "[%u]%c ", op->nframes, - (op->flags & RX_CHECK_DLC)?'d':' '); + seq_printf(m, "rx_op: %03X %-5s ", op->can_id, + bcm_proc_getifname(ifname, op->ifindex)); + + if (op->flags & CAN_FD_FRAME) + seq_printf(m, "(%u)", op->nframes); + else + seq_printf(m, "[%u]", op->nframes); + + seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' '); + if (op->kt_ival1.tv64) seq_printf(m, "timeo=%lld ", - (long long) - ktime_to_us(op->kt_ival1)); + (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2.tv64) seq_printf(m, "thr=%lld ", - (long long) - ktime_to_us(op->kt_ival2)); + (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# recv %ld (%ld) => reduction: ", - op->frames_filtered, op->frames_abs); + op->frames_filtered, op->frames_abs); reduction = 100 - (op->frames_filtered * 100) / op->frames_abs; seq_printf(m, "%s%ld%%\n", - (reduction == 100)?"near ":"", reduction); + (reduction == 100) ? "near " : "", reduction); } list_for_each_entry(op, &bo->tx_ops, list) { - seq_printf(m, "tx_op: %03X %s [%u] ", - op->can_id, - bcm_proc_getifname(ifname, op->ifindex), - op->nframes); + seq_printf(m, "tx_op: %03X %s ", op->can_id, + bcm_proc_getifname(ifname, op->ifindex)); + + if (op->flags & CAN_FD_FRAME) + seq_printf(m, "(%u) ", op->nframes); + else + seq_printf(m, "[%u] ", op->nframes); if (op->kt_ival1.tv64) seq_printf(m, "t1=%lld ", - (long long) ktime_to_us(op->kt_ival1)); + (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2.tv64) seq_printf(m, "t2=%lld ", - (long long) ktime_to_us(op->kt_ival2)); + (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# sent %ld\n", op->frames_abs); } @@ -248,7 +260,7 @@ static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; struct net_device *dev; - struct can_frame *cf = &op->frames[op->currframe]; + struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; /* no target device? => exit */ if (!op->ifindex) @@ -260,7 +272,7 @@ static void bcm_can_tx(struct bcm_op *op) return; } - skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), gfp_any()); + skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any()); if (!skb) goto out; @@ -268,7 +280,7 @@ static void bcm_can_tx(struct bcm_op *op) can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; - memcpy(skb_put(skb, CFSIZ), cf, CFSIZ); + memcpy(skb_put(skb, op->cfsiz), cf, op->cfsiz); /* send with loopback */ skb->dev = dev; @@ -282,7 +294,7 @@ static void bcm_can_tx(struct bcm_op *op) /* reached last frame? */ if (op->currframe >= op->nframes) op->currframe = 0; - out: +out: dev_put(dev); } @@ -291,13 +303,13 @@ static void bcm_can_tx(struct bcm_op *op) * (consisting of bcm_msg_head + x CAN frames) */ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, - struct can_frame *frames, int has_timestamp) + struct canfd_frame *frames, int has_timestamp) { struct sk_buff *skb; - struct can_frame *firstframe; + struct canfd_frame *firstframe; struct sockaddr_can *addr; struct sock *sk = op->sk; - unsigned int datalen = head->nframes * CFSIZ; + unsigned int datalen = head->nframes * op->cfsiz; int err; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); @@ -307,19 +319,19 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, memcpy(skb_put(skb, sizeof(*head)), head, sizeof(*head)); if (head->nframes) { - /* can_frames starting here */ - firstframe = (struct can_frame *)skb_tail_pointer(skb); + /* CAN frames starting here */ + firstframe = (struct canfd_frame *)skb_tail_pointer(skb); memcpy(skb_put(skb, datalen), frames, datalen); /* - * the BCM uses the can_dlc-element of the can_frame + * the BCM uses the flags-element of the canfd_frame * structure for internal purposes. This is only * relevant for updates that are generated by the * BCM, where nframes is 1 */ if (head->nframes == 1) - firstframe->can_dlc &= BCM_CAN_DLC_MASK; + firstframe->flags &= BCM_CAN_FLAGS_MASK; } if (has_timestamp) { @@ -406,7 +418,7 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) /* * bcm_rx_changed - create a RX_CHANGED notification due to changed content */ -static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) +static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) { struct bcm_msg_head head; @@ -418,7 +430,7 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) op->frames_filtered = op->frames_abs = 0; /* this element is not throttled anymore */ - data->can_dlc &= (BCM_CAN_DLC_MASK|RX_RECV); + data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV); head.opcode = RX_CHANGED; head.flags = op->flags; @@ -437,13 +449,13 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) * 2. send a notification to the user (if possible) */ static void bcm_rx_update_and_send(struct bcm_op *op, - struct can_frame *lastdata, - const struct can_frame *rxdata) + struct canfd_frame *lastdata, + const struct canfd_frame *rxdata) { - memcpy(lastdata, rxdata, CFSIZ); + memcpy(lastdata, rxdata, op->cfsiz); /* mark as used and throttled by default */ - lastdata->can_dlc |= (RX_RECV|RX_THR); + lastdata->flags |= (RX_RECV|RX_THR); /* throttling mode inactive ? */ if (!op->kt_ival2.tv64) { @@ -481,33 +493,36 @@ rx_changed_settime: * received data stored in op->last_frames[] */ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, - const struct can_frame *rxdata) + const struct canfd_frame *rxdata) { + struct canfd_frame *cf = op->frames + op->cfsiz * index; + struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; + int i; + /* - * no one uses the MSBs of can_dlc for comparison, + * no one uses the MSBs of flags for comparison, * so we use it here to detect the first time of reception */ - if (!(op->last_frames[index].can_dlc & RX_RECV)) { + if (!(lcf->flags & RX_RECV)) { /* received data for the first time => send update to user */ - bcm_rx_update_and_send(op, &op->last_frames[index], rxdata); + bcm_rx_update_and_send(op, lcf, rxdata); return; } - /* do a real check in can_frame data section */ - - if ((GET_U64(&op->frames[index]) & GET_U64(rxdata)) != - (GET_U64(&op->frames[index]) & GET_U64(&op->last_frames[index]))) { - bcm_rx_update_and_send(op, &op->last_frames[index], rxdata); - return; + /* do a real check in CAN frame data section */ + for (i = 0; i < rxdata->len; i += 8) { + if ((get_u64(cf, i) & get_u64(rxdata, i)) != + (get_u64(cf, i) & get_u64(lcf, i))) { + bcm_rx_update_and_send(op, lcf, rxdata); + return; + } } if (op->flags & RX_CHECK_DLC) { - /* do a real check in can_frame dlc */ - if (rxdata->can_dlc != (op->last_frames[index].can_dlc & - BCM_CAN_DLC_MASK)) { - bcm_rx_update_and_send(op, &op->last_frames[index], - rxdata); + /* do a real check in CAN frame length */ + if (rxdata->len != lcf->len) { + bcm_rx_update_and_send(op, lcf, rxdata); return; } } @@ -556,8 +571,8 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) /* if user wants to be informed, when cyclic CAN-Messages come back */ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { - /* clear received can_frames to indicate 'nothing received' */ - memset(op->last_frames, 0, op->nframes * CFSIZ); + /* clear received CAN frames to indicate 'nothing received' */ + memset(op->last_frames, 0, op->nframes * op->cfsiz); } return HRTIMER_NORESTART; @@ -569,9 +584,11 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) static inline int bcm_rx_do_flush(struct bcm_op *op, int update, unsigned int index) { - if ((op->last_frames) && (op->last_frames[index].can_dlc & RX_THR)) { + struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; + + if ((op->last_frames) && (lcf->flags & RX_THR)) { if (update) - bcm_rx_changed(op, &op->last_frames[index]); + bcm_rx_changed(op, lcf); return 1; } return 0; @@ -636,15 +653,19 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) static void bcm_rx_handler(struct sk_buff *skb, void *data) { struct bcm_op *op = (struct bcm_op *)data; - const struct can_frame *rxframe = (struct can_frame *)skb->data; + const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data; unsigned int i; - /* disable timeout */ - hrtimer_cancel(&op->timer); - if (op->can_id != rxframe->can_id) return; + /* make sure to handle the correct frame type (CAN / CAN FD) */ + if (skb->len != op->cfsiz) + return; + + /* disable timeout */ + hrtimer_cancel(&op->timer); + /* save rx timestamp */ op->rx_stamp = skb->tstamp; /* save originator for recvfrom() */ @@ -675,13 +696,14 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) * multiplex compare * * find the first multiplex mask that fits. - * Remark: The MUX-mask is stored in index 0 + * Remark: The MUX-mask is stored in index 0 - but only the + * first 64 bits of the frame data[] are relevant (CAN FD) */ for (i = 1; i < op->nframes; i++) { - if ((GET_U64(&op->frames[0]) & GET_U64(rxframe)) == - (GET_U64(&op->frames[0]) & - GET_U64(&op->frames[i]))) { + if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) == + (get_u64(op->frames, 0) & + get_u64(op->frames + op->cfsiz * i, 0))) { bcm_rx_cmp_to_index(op, i, rxframe); break; } @@ -695,13 +717,14 @@ rx_starttimer: /* * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements */ -static struct bcm_op *bcm_find_op(struct list_head *ops, canid_t can_id, - int ifindex) +static struct bcm_op *bcm_find_op(struct list_head *ops, + struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op; list_for_each_entry(op, ops, list) { - if ((op->can_id == can_id) && (op->ifindex == ifindex)) + if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && + (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) return op; } @@ -744,12 +767,14 @@ static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) /* * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops) */ -static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex) +static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, + int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { - if ((op->can_id == can_id) && (op->ifindex == ifindex)) { + if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && + (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { /* * Don't care if we're bound or not (due to netdev @@ -789,12 +814,14 @@ static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex) /* * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops) */ -static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex) +static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, + int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { - if ((op->can_id == can_id) && (op->ifindex == ifindex)) { + if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && + (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { list_del(&op->list); bcm_remove_op(op); return 1; /* done */ @@ -810,7 +837,7 @@ static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex) static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head, int ifindex) { - struct bcm_op *op = bcm_find_op(ops, msg_head->can_id, ifindex); + struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex); if (!op) return -EINVAL; @@ -835,6 +862,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; + struct canfd_frame *cf; unsigned int i; int err; @@ -842,39 +870,46 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!ifindex) return -ENODEV; - /* check nframes boundaries - we need at least one can_frame */ + /* check nframes boundaries - we need at least one CAN frame */ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) return -EINVAL; /* check the given can_id */ - op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex); - + op = bcm_find_op(&bo->tx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* - * Do we need more space for the can_frames than currently + * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; - /* update can_frames content */ + /* update CAN frames content */ for (i = 0; i < msg_head->nframes; i++) { - err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ); - if (op->frames[i].can_dlc > 8) - err = -EINVAL; + cf = op->frames + op->cfsiz * i; + err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); + + if (op->flags & CAN_FD_FRAME) { + if (cf->len > 64) + err = -EINVAL; + } else { + if (cf->len > 8) + err = -EINVAL; + } if (err < 0) return err; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ - op->frames[i].can_id = msg_head->can_id; + cf->can_id = msg_head->can_id; } } + op->flags = msg_head->flags; } else { /* insert new BCM operation for the given can_id */ @@ -883,11 +918,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!op) return -ENOMEM; - op->can_id = msg_head->can_id; + op->can_id = msg_head->can_id; + op->cfsiz = CFSIZ(msg_head->flags); + op->flags = msg_head->flags; - /* create array for can_frames and copy the data */ + /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { - op->frames = kmalloc(msg_head->nframes * CFSIZ, + op->frames = kmalloc(msg_head->nframes * op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); @@ -897,10 +934,17 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->frames = &op->sframe; for (i = 0; i < msg_head->nframes; i++) { - err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ); - if (op->frames[i].can_dlc > 8) - err = -EINVAL; + cf = op->frames + op->cfsiz * i; + err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); + + if (op->flags & CAN_FD_FRAME) { + if (cf->len > 64) + err = -EINVAL; + } else { + if (cf->len > 8) + err = -EINVAL; + } if (err < 0) { if (op->frames != &op->sframe) @@ -911,7 +955,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ - op->frames[i].can_id = msg_head->can_id; + cf->can_id = msg_head->can_id; } } @@ -946,8 +990,6 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* check flags */ - op->flags = msg_head->flags; - if (op->flags & TX_RESET_MULTI_IDX) { /* start multiple frame transmission with index 0 */ op->currframe = 0; @@ -968,7 +1010,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (op->flags & STARTTIMER) { hrtimer_cancel(&op->timer); - /* spec: send can_frame when starting timer */ + /* spec: send CAN frame when starting timer */ op->flags |= TX_ANNOUNCE; } @@ -981,7 +1023,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (op->flags & STARTTIMER) bcm_tx_start_timer(op); - return msg_head->nframes * CFSIZ + MHSIZ; + return msg_head->nframes * op->cfsiz + MHSIZ; } /* @@ -1012,12 +1054,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, return -EINVAL; /* check the given can_id */ - op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex); + op = bcm_find_op(&bo->rx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* - * Do we need more space for the can_frames than currently + * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ @@ -1025,17 +1067,18 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, return -E2BIG; if (msg_head->nframes) { - /* update can_frames content */ + /* update CAN frames content */ err = memcpy_from_msg((u8 *)op->frames, msg, - msg_head->nframes * CFSIZ); + msg_head->nframes * op->cfsiz); if (err < 0) return err; /* clear last_frames to indicate 'nothing received' */ - memset(op->last_frames, 0, msg_head->nframes * CFSIZ); + memset(op->last_frames, 0, msg_head->nframes * op->cfsiz); } op->nframes = msg_head->nframes; + op->flags = msg_head->flags; /* Only an update -> do not call can_rx_register() */ do_rx_register = 0; @@ -1046,20 +1089,22 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!op) return -ENOMEM; - op->can_id = msg_head->can_id; - op->nframes = msg_head->nframes; + op->can_id = msg_head->can_id; + op->nframes = msg_head->nframes; + op->cfsiz = CFSIZ(msg_head->flags); + op->flags = msg_head->flags; if (msg_head->nframes > 1) { - /* create array for can_frames and copy the data */ - op->frames = kmalloc(msg_head->nframes * CFSIZ, + /* create array for CAN frames and copy the data */ + op->frames = kmalloc(msg_head->nframes * op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } - /* create and init array for received can_frames */ - op->last_frames = kzalloc(msg_head->nframes * CFSIZ, + /* create and init array for received CAN frames */ + op->last_frames = kzalloc(msg_head->nframes * op->cfsiz, GFP_KERNEL); if (!op->last_frames) { kfree(op->frames); @@ -1074,7 +1119,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (msg_head->nframes) { err = memcpy_from_msg((u8 *)op->frames, msg, - msg_head->nframes * CFSIZ); + msg_head->nframes * op->cfsiz); if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); @@ -1116,7 +1161,6 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */ /* check flags */ - op->flags = msg_head->flags; if (op->flags & RX_RTR_FRAME) { @@ -1188,13 +1232,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } } - return msg_head->nframes * CFSIZ + MHSIZ; + return msg_head->nframes * op->cfsiz + MHSIZ; } /* * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg) */ -static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) +static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, + int cfsiz) { struct sk_buff *skb; struct net_device *dev; @@ -1204,13 +1249,13 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) if (!ifindex) return -ENODEV; - skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), GFP_KERNEL); + skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL); if (!skb) return -ENOMEM; can_skb_reserve(skb); - err = memcpy_from_msg(skb_put(skb, CFSIZ), msg, CFSIZ); + err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz); if (err < 0) { kfree_skb(skb); return err; @@ -1232,7 +1277,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) if (err) return err; - return CFSIZ + MHSIZ; + return cfsiz + MHSIZ; } /* @@ -1244,13 +1289,23 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct bcm_sock *bo = bcm_sk(sk); int ifindex = bo->ifindex; /* default ifindex for this bcm_op */ struct bcm_msg_head msg_head; + int cfsiz; int ret; /* read bytes or error codes as return value */ if (!bo->bound) return -ENOTCONN; /* check for valid message length from userspace */ - if (size < MHSIZ || (size - MHSIZ) % CFSIZ) + if (size < MHSIZ) + return -EINVAL; + + /* read message head information */ + ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); + if (ret < 0) + return ret; + + cfsiz = CFSIZ(msg_head.flags); + if ((size - MHSIZ) % cfsiz) return -EINVAL; /* check for alternative ifindex for this bcm_op */ @@ -1284,12 +1339,6 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } } - /* read message head information */ - - ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); - if (ret < 0) - return ret; - lock_sock(sk); switch (msg_head.opcode) { @@ -1303,14 +1352,14 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) break; case TX_DELETE: - if (bcm_delete_tx_op(&bo->tx_ops, msg_head.can_id, ifindex)) + if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case RX_DELETE: - if (bcm_delete_rx_op(&bo->rx_ops, msg_head.can_id, ifindex)) + if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; @@ -1329,11 +1378,11 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) break; case TX_SEND: - /* we need exactly one can_frame behind the msg head */ - if ((msg_head.nframes != 1) || (size != CFSIZ + MHSIZ)) + /* we need exactly one CAN frame behind the msg head */ + if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ)) ret = -EINVAL; else - ret = bcm_tx_send(msg, ifindex, sk); + ret = bcm_tx_send(msg, ifindex, sk, cfsiz); break; default: diff --git a/net/can/proc.c b/net/can/proc.c index 1a19b985a..85ef7bb0f 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -517,8 +517,7 @@ void can_init_proc(void) can_dir = proc_mkdir("can", init_net.proc_net); if (!can_dir) { - printk(KERN_INFO "can: failed to create /proc/net/can . " - "CONFIG_PROC_FS missing?\n"); + pr_info("can: failed to create /proc/net/can.\n"); return; } diff --git a/net/ceph/Makefile b/net/ceph/Makefile index 958d98569..84cbed630 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o + pagevec.o snapshot.o string_table.o diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 55d2bfee1..bddfcf6f0 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -747,6 +747,8 @@ out: static void __exit exit_ceph_lib(void) { dout("exit_ceph_lib\n"); + WARN_ON(!ceph_strings_empty()); + ceph_osdc_cleanup(); ceph_msgr_exit(); ceph_crypto_shutdown(); diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c index 41466ccb9..7d54e944d 100644 --- a/net/ceph/ceph_fs.c +++ b/net/ceph/ceph_fs.c @@ -9,9 +9,9 @@ */ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) { - __u32 su = le32_to_cpu(layout->fl_stripe_unit); - __u32 sc = le32_to_cpu(layout->fl_stripe_count); - __u32 os = le32_to_cpu(layout->fl_object_size); + __u32 su = layout->stripe_unit; + __u32 sc = layout->stripe_count; + __u32 os = layout->object_size; /* stripe unit, object size must be non-zero, 64k increment */ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) @@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) return 1; } +void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit); + fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count); + fl->object_size = le32_to_cpu(legacy->fl_object_size); + fl->pool_id = le32_to_cpu(legacy->fl_pg_pool); + if (fl->pool_id == 0) + fl->pool_id = -1; +} +EXPORT_SYMBOL(ceph_file_layout_from_legacy); + +void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit); + legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count); + legacy->fl_object_size = cpu_to_le32(fl->object_size); + if (fl->pool_id >= 0) + legacy->fl_pg_pool = cpu_to_le32(fl->pool_id); + else + legacy->fl_pg_pool = 0; +} +EXPORT_SYMBOL(ceph_file_layout_to_legacy); int ceph_flags_to_mode(int flags) { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index e77b04ca7..c62b2b029 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -156,8 +156,16 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) seq_printf(s, "]/%d\t[", t->up.primary); for (i = 0; i < t->acting.size; i++) seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); - seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary, - t->target_oid.name_len, t->target_oid.name, t->flags); + seq_printf(s, "]/%d\t", t->acting.primary); + if (t->target_oloc.pool_ns) { + seq_printf(s, "%*pE/%*pE\t0x%x", + (int)t->target_oloc.pool_ns->len, + t->target_oloc.pool_ns->str, + t->target_oid.name_len, t->target_oid.name, t->flags); + } else { + seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len, + t->target_oid.name, t->flags); + } if (t->paused) seq_puts(s, "\tP"); } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 37c38a7fb..ef34a0271 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -227,9 +227,10 @@ static void __schedule_delayed(struct ceph_mon_client *monc) } const char *ceph_sub_str[] = { - [CEPH_SUB_MDSMAP] = "mdsmap", [CEPH_SUB_MONMAP] = "monmap", [CEPH_SUB_OSDMAP] = "osdmap", + [CEPH_SUB_FSMAP] = "fsmap.user", + [CEPH_SUB_MDSMAP] = "mdsmap", }; /* @@ -573,7 +574,7 @@ static void complete_generic_request(struct ceph_mon_generic_request *req) put_generic_request(req); } -void cancel_generic_request(struct ceph_mon_generic_request *req) +static void cancel_generic_request(struct ceph_mon_generic_request *req) { struct ceph_mon_client *monc = req->monc; struct ceph_mon_generic_request *lookup_req; @@ -1193,6 +1194,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: + case CEPH_MSG_FS_MAP_USER: m = ceph_msg_new(type, front_len, GFP_NOFS, false); if (!m) return NULL; /* ENOMEM--return skip == 0 */ diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index ddec1c10a..aaed59a47 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -5,6 +5,7 @@ #include #include +#include #include static void *msgpool_alloc(gfp_t gfp_mask, void *arg) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index aee117f83..a97e7b506 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -387,7 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest, static void target_destroy(struct ceph_osd_request_target *t) { ceph_oid_destroy(&t->base_oid); + ceph_oloc_destroy(&t->base_oloc); ceph_oid_destroy(&t->target_oid); + ceph_oloc_destroy(&t->target_oloc); } /* @@ -533,6 +535,11 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); +static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc) +{ + return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0); +} + int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) { struct ceph_osd_client *osdc = req->r_osdc; @@ -540,11 +547,13 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) int msg_size; WARN_ON(ceph_oid_empty(&req->r_base_oid)); + WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ - msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ + msg_size += CEPH_ENCODING_START_BLK_LEN + + ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pgid */ msg_size += 4 + req->r_base_oid.name_len; /* oid */ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); @@ -932,7 +941,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); } else { - u32 object_size = le32_to_cpu(layout->fl_object_size); + u32 object_size = layout->object_size; u32 object_base = off - objoff; if (!(truncate_seq == 1 && truncate_size == -1ULL)) { if (truncate_size <= object_base) { @@ -948,7 +957,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } req->r_flags = flags; - req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); + req->r_base_oloc.pool = layout->pool_id; + req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); req->r_snapid = vino.snap; @@ -1489,12 +1499,16 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) p += sizeof(req->r_replay_version); /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); + ceph_start_encoding(&p, 5, 4, + ceph_oloc_encoding_size(&req->r_t.target_oloc)); ceph_encode_64(&p, req->r_t.target_oloc.pool); ceph_encode_32(&p, -1); /* preferred */ ceph_encode_32(&p, 0); /* key len */ + if (req->r_t.target_oloc.pool_ns) + ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str, + req->r_t.target_oloc.pool_ns->len); + else + ceph_encode_32(&p, 0); /* pgid */ ceph_encode_8(&p, 1); @@ -2594,9 +2608,22 @@ static int ceph_oloc_decode(void **p, void *end, } if (struct_v >= 5) { + bool changed = false; + len = ceph_decode_32(p); if (len > 0) { - pr_warn("ceph_object_locator::nspace is set\n"); + ceph_decode_need(p, end, len, e_inval); + if (!oloc->pool_ns || + ceph_compare_string(oloc->pool_ns, *p, len)) + changed = true; + *p += len; + } else { + if (oloc->pool_ns) + changed = true; + } + if (changed) { + /* redirect changes namespace */ + pr_warn("ceph_object_locator::nspace is changed\n"); goto e_inval; } } @@ -2806,7 +2833,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) goto out_unlock_session; } + m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns; ret = decode_MOSDOpReply(msg, &m); + m.redirect.oloc.pool_ns = NULL; if (ret) { pr_err("failed to decode MOSDOpReply for tid %llu: %d\n", req->r_tid, ret); @@ -2835,7 +2864,11 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) unlink_request(osd, req); mutex_unlock(&osd->lock); - ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); + /* + * Not ceph_oloc_copy() - changing pool_ns is not + * supported. + */ + req->r_t.target_oloc.pool = m.redirect.oloc.pool; req->r_flags |= CEPH_OSD_FLAG_REDIRECTED; req->r_tid = 0; __submit_request(req, false); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7e480bf75..d2436880b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1510,6 +1510,24 @@ bad: return ERR_PTR(err); } +void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + WARN_ON(!ceph_oloc_empty(dest)); + WARN_ON(dest->pool_ns); /* empty() only covers ->pool */ + + dest->pool = src->pool; + if (src->pool_ns) + dest->pool_ns = ceph_get_string(src->pool_ns); +} +EXPORT_SYMBOL(ceph_oloc_copy); + +void ceph_oloc_destroy(struct ceph_object_locator *oloc) +{ + ceph_put_string(oloc->pool_ns); +} +EXPORT_SYMBOL(ceph_oloc_destroy); + void ceph_oid_copy(struct ceph_object_id *dest, const struct ceph_object_id *src) { @@ -1770,9 +1788,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *ono, u64 *oxoff, u64 *oxlen) { - u32 osize = le32_to_cpu(layout->fl_object_size); - u32 su = le32_to_cpu(layout->fl_stripe_unit); - u32 sc = le32_to_cpu(layout->fl_stripe_count); + u32 osize = layout->object_size; + u32 su = layout->stripe_unit; + u32 sc = layout->stripe_count; u32 bl, stripeno, stripepos, objsetno; u32 su_per_object; u64 t, su_offset; @@ -1844,12 +1862,34 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, if (!pi) return -ENOENT; - raw_pgid->pool = oloc->pool; - raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, - oid->name_len); - - dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, - raw_pgid->pool, raw_pgid->seed); + if (!oloc->pool_ns) { + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, + raw_pgid->pool, raw_pgid->seed); + } else { + char stack_buf[256]; + char *buf = stack_buf; + int nsl = oloc->pool_ns->len; + size_t total = nsl + 1 + oid->name_len; + + if (total > sizeof(stack_buf)) { + buf = kmalloc(total, GFP_NOIO); + if (!buf) + return -ENOMEM; + } + memcpy(buf, oloc->pool_ns->str, nsl); + buf[nsl] = '\037'; + memcpy(buf + nsl + 1, oid->name, oid->name_len); + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total); + if (buf != stack_buf) + kfree(buf); + dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__, + oid->name, nsl, oloc->pool_ns->str, + raw_pgid->pool, raw_pgid->seed); + } return 0; } EXPORT_SYMBOL(ceph_object_locator_to_pg); diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c new file mode 100644 index 000000000..22fb96efc --- /dev/null +++ b/net/ceph/string_table.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(string_tree_lock); +static struct rb_root string_tree = RB_ROOT; + +struct ceph_string *ceph_find_or_create_string(const char* str, size_t len) +{ + struct ceph_string *cs, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&string_tree_lock); + p = &string_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_string, node); + ret = ceph_compare_string(exist, str, len); + if (ret > 0) + p = &(*p)->rb_left; + else if (ret < 0) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist && !kref_get_unless_zero(&exist->kref)) { + rb_erase(&exist->node, &string_tree); + RB_CLEAR_NODE(&exist->node); + exist = NULL; + } + spin_unlock(&string_tree_lock); + if (exist) + return exist; + + cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS); + if (!cs) + return NULL; + + kref_init(&cs->kref); + cs->len = len; + memcpy(cs->str, str, len); + cs->str[len] = 0; + +retry: + exist = NULL; + parent = NULL; + p = &string_tree.rb_node; + spin_lock(&string_tree_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_string, node); + ret = ceph_compare_string(exist, str, len); + if (ret > 0) + p = &(*p)->rb_left; + else if (ret < 0) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + ret = 0; + if (!exist) { + rb_link_node(&cs->node, parent, p); + rb_insert_color(&cs->node, &string_tree); + } else if (!kref_get_unless_zero(&exist->kref)) { + rb_erase(&exist->node, &string_tree); + RB_CLEAR_NODE(&exist->node); + ret = -EAGAIN; + } + spin_unlock(&string_tree_lock); + if (ret == -EAGAIN) + goto retry; + + if (exist) { + kfree(cs); + cs = exist; + } + + return cs; +} +EXPORT_SYMBOL(ceph_find_or_create_string); + +void ceph_release_string(struct kref *ref) +{ + struct ceph_string *cs = container_of(ref, struct ceph_string, kref); + + spin_lock(&string_tree_lock); + if (!RB_EMPTY_NODE(&cs->node)) { + rb_erase(&cs->node, &string_tree); + RB_CLEAR_NODE(&cs->node); + } + spin_unlock(&string_tree_lock); + + kfree_rcu(cs, rcu); +} +EXPORT_SYMBOL(ceph_release_string); + +bool ceph_strings_empty(void) +{ + return RB_EMPTY_ROOT(&string_tree); +} diff --git a/net/core/dev.c b/net/core/dev.c index 97fb3da50..ea6312057 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include #include @@ -139,6 +140,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -196,7 +198,7 @@ static inline void dev_base_seq_inc(struct net *net) static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { - unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } @@ -2249,11 +2251,12 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues); */ int netif_get_num_default_rss_queues(void) { - return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); + return is_kdump_kernel() ? + 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); } EXPORT_SYMBOL(netif_get_num_default_rss_queues); -static inline void __netif_reschedule(struct Qdisc *q) +static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; @@ -2420,7 +2423,7 @@ EXPORT_SYMBOL(__skb_tx_hash); static void skb_warn_bad_offload(const struct sk_buff *skb) { - static const netdev_features_t null_features = 0; + static const netdev_features_t null_features; struct net_device *dev = skb->dev; const char *name = ""; @@ -3068,6 +3071,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); + struct sk_buff *to_free = NULL; bool contended; int rc; @@ -3075,7 +3079,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. - * This permits __QDISC___STATE_RUNNING owner to get the lock more + * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. */ contended = qdisc_is_running(q); @@ -3084,7 +3088,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - kfree_skb(skb); + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && qdisc_run_begin(q)) { @@ -3107,7 +3111,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - rc = q->enqueue(skb, q) & NET_XMIT_MASK; + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -3117,6 +3121,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } } spin_unlock(root_lock); + if (unlikely(to_free)) + kfree_skb_list(to_free); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; @@ -3142,8 +3148,6 @@ static void skb_update_prio(struct sk_buff *skb) DEFINE_PER_CPU(int, xmit_recursion); EXPORT_SYMBOL(xmit_recursion); -#define RECURSION_LIMIT 10 - /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in @@ -3386,8 +3390,8 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) int cpu = smp_processor_id(); /* ok because BHs are off */ if (txq->xmit_lock_owner != cpu) { - - if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + if (unlikely(__this_cpu_read(xmit_recursion) > + XMIT_RECURSION_LIMIT)) goto recursion_alert; skb = validate_xmit_skb(skb, dev); @@ -3898,22 +3902,14 @@ static void net_tx_action(struct softirq_action *h) head = head->next_sched; root_lock = qdisc_lock(q); - if (spin_trylock(root_lock)) { - smp_mb__before_atomic(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - qdisc_run(q); - spin_unlock(root_lock); - } else { - if (!test_bit(__QDISC_STATE_DEACTIVATED, - &q->state)) { - __netif_reschedule(q); - } else { - smp_mb__before_atomic(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - } - } + spin_lock(root_lock); + /* We need to make sure head->next_sched is read + * before clearing __QDISC_STATE_SCHED + */ + smp_mb__before_atomic(); + clear_bit(__QDISC_STATE_SCHED, &q->state); + qdisc_run(q); + spin_unlock(root_lock); } } } @@ -4993,7 +4989,7 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (test_bit(NAPI_STATE_SCHED, &napi->state)) { rc = napi->poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); if (rc == BUSY_POLL_BUDGET) { napi_complete_done(napi, rc); napi_schedule(napi); @@ -5149,7 +5145,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) work = 0; if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); - trace_napi_poll(n); + trace_napi_poll(n, work, weight); } WARN_ON_ONCE(work > weight); @@ -5465,6 +5461,52 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) } EXPORT_SYMBOL(netdev_lower_get_next); +/** + * netdev_all_lower_get_next - Get the next device from all lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's all lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour all lower + * list will remain unchanged. + */ +struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->all_adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_all_lower_get_next); + +/** + * netdev_all_lower_get_next_rcu - Get the next device from all + * lower neighbour list, RCU variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's all lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_first_or_null_rcu(&dev->all_adj_list.lower, + struct netdev_adjacent, list); + + return lower ? lower->dev : NULL; +} +EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); + /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU @@ -5935,7 +5977,7 @@ static void netdev_adjacent_add_links(struct net_device *dev) struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); @@ -5944,7 +5986,7 @@ static void netdev_adjacent_add_links(struct net_device *dev) } list_for_each_entry(iter, &dev->adj_list.lower, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); @@ -5960,7 +6002,7 @@ static void netdev_adjacent_del_links(struct net_device *dev) struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.lower); @@ -5969,7 +6011,7 @@ static void netdev_adjacent_del_links(struct net_device *dev) } list_for_each_entry(iter, &dev->adj_list.lower, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.upper); @@ -5985,7 +6027,7 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); @@ -5994,7 +6036,7 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) } list_for_each_entry(iter, &dev->adj_list.lower, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); @@ -6019,8 +6061,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev, EXPORT_SYMBOL(netdev_lower_dev_get_private); -int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(const struct net_device *dev)) +int dev_get_nest_level(struct net_device *dev) { struct net_device *lower = NULL; struct list_head *iter; @@ -6030,15 +6071,12 @@ int dev_get_nest_level(struct net_device *dev, ASSERT_RTNL(); netdev_for_each_lower_dev(dev, lower, iter) { - nest = dev_get_nest_level(lower, type_check); + nest = dev_get_nest_level(lower); if (max_nest < nest) max_nest = nest; } - if (type_check(dev)) - max_nest++; - - return max_nest; + return max_nest + 1; } EXPORT_SYMBOL(dev_get_nest_level); @@ -6062,6 +6100,50 @@ void netdev_lower_state_changed(struct net_device *lower_dev, } EXPORT_SYMBOL(netdev_lower_state_changed); +int netdev_default_l2upper_neigh_construct(struct net_device *dev, + struct neighbour *n) +{ + struct net_device *lower_dev, *stop_dev; + struct list_head *iter; + int err; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (!lower_dev->netdev_ops->ndo_neigh_construct) + continue; + err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n); + if (err) { + stop_dev = lower_dev; + goto rollback; + } + } + return 0; + +rollback: + netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (lower_dev == stop_dev) + break; + if (!lower_dev->netdev_ops->ndo_neigh_destroy) + continue; + lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); + } + return err; +} +EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct); + +void netdev_default_l2upper_neigh_destroy(struct net_device *dev, + struct neighbour *n) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (!lower_dev->netdev_ops->ndo_neigh_destroy) + continue; + lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); + } +} +EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -6545,6 +6627,38 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); +/** + * dev_change_xdp_fd - set or clear a bpf program for a device rx path + * @dev: device + * @fd: new program fd or negative value to clear + * + * Set or clear a bpf program for a device + */ +int dev_change_xdp_fd(struct net_device *dev, int fd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct bpf_prog *prog = NULL; + struct netdev_xdp xdp = {}; + int err; + + if (!ops->ndo_xdp) + return -EOPNOTSUPP; + if (fd >= 0) { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + xdp.command = XDP_SETUP_PROG; + xdp.prog = prog; + err = ops->ndo_xdp(dev, &xdp); + if (err < 0 && prog) + bpf_prog_put(prog); + + return err; +} +EXPORT_SYMBOL(dev_change_xdp_fd); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace diff --git a/net/core/devlink.c b/net/core/devlink.c index 933e8d4d3..1b5063088 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -26,6 +26,10 @@ #include #include #include +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); static LIST_HEAD(devlink_list); @@ -1394,6 +1398,78 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, return -EOPNOTSUPP; } +static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, u16 mode) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + struct sk_buff *msg; + u16 mode; + int err; + + if (!ops || !ops->eswitch_mode_get) + return -EOPNOTSUPP; + + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, + info->snd_portid, info->snd_seq, 0, mode); + + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + u16 mode; + + if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) + return -EINVAL; + + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + + if (ops && ops->eswitch_mode_set) + return ops->eswitch_mode_set(devlink, mode); + return -EOPNOTSUPP; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -1407,6 +1483,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -1525,6 +1602,20 @@ static const struct genl_ops devlink_nl_ops[] = { DEVLINK_NL_FLAG_NEED_SB | DEVLINK_NL_FLAG_LOCK_PORTS, }, + { + .cmd = DEVLINK_CMD_ESWITCH_MODE_GET, + .doit = devlink_nl_cmd_eswitch_mode_get_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_ESWITCH_MODE_SET, + .doit = devlink_nl_cmd_eswitch_mode_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; /** diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 252e155c8..d6b3b5795 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -187,7 +187,8 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *locatio trace_drop_common(skb, location); } -static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi) +static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, + int work, int budget) { struct dm_hw_stat_delta *new_stat; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f4034817d..977489820 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -89,6 +89,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", + [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 840acebbb..be4629c34 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -173,7 +173,8 @@ void fib_rules_unregister(struct fib_rules_ops *ops) EXPORT_SYMBOL_GPL(fib_rules_unregister); static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, - struct flowi *fl, int flags) + struct flowi *fl, int flags, + struct fib_lookup_arg *arg) { int ret = 0; @@ -189,6 +190,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; + if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -204,7 +208,7 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: - if (!fib_rule_match(rule, ops, fl, flags)) + if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { @@ -265,7 +269,50 @@ errout: return err; } -static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) +static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, + struct nlattr **tb, struct fib_rule *rule) +{ + struct fib_rule *r; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action != rule->action) + continue; + + if (r->table != rule->table) + continue; + + if (r->pref != rule->pref) + continue; + + if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + continue; + + if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + continue; + + if (r->mark != rule->mark) + continue; + + if (r->mark_mask != rule->mark_mask) + continue; + + if (r->tun_id != rule->tun_id) + continue; + + if (r->fr_net != rule->fr_net) + continue; + + if (r->l3mdev != rule->l3mdev) + continue; + + if (!ops->compare(r, frh, tb)) + continue; + return 1; + } + return 0; +} + +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -336,6 +383,14 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_TUN_ID]) rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + if (tb[FRA_L3MDEV]) { +#ifdef CONFIG_NET_L3_MASTER_DEV + rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); + if (rule->l3mdev != 1) +#endif + goto errout_free; + } + rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); @@ -371,6 +426,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } else if (rule->action == FR_ACT_GOTO) goto errout_free; + if (rule->l3mdev && rule->table) + goto errout_free; + + if ((nlh->nlmsg_flags & NLM_F_EXCL) && + rule_exists(ops, frh, tb, rule)) { + err = -EEXIST; + goto errout_free; + } + err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; @@ -424,8 +488,9 @@ errout: rules_ops_put(ops); return err; } +EXPORT_SYMBOL_GPL(fib_nl_newrule); -static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) +int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -483,6 +548,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) continue; + if (tb[FRA_L3MDEV] && + (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -536,6 +605,7 @@ errout: rules_ops_put(ops); return err; } +EXPORT_SYMBOL_GPL(fib_nl_delrule); static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) @@ -607,7 +677,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && - nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD))) + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || + (rule->l3mdev && + nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/core/filter.c b/net/core/filter.c index bca32d63a..cb06aceb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -150,6 +150,12 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return raw_smp_processor_id(); } +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = __get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -748,6 +754,17 @@ static bool chk_code_allowed(u16 code_to_probe) return codes[code_to_probe]; } +static bool bpf_check_basics_ok(const struct sock_filter *filter, + unsigned int flen) +{ + if (filter == NULL) + return false; + if (flen == 0 || flen > BPF_MAXINSNS) + return false; + + return true; +} + /** * bpf_check_classic - verify socket filter code * @filter: filter to verify @@ -768,9 +785,6 @@ static int bpf_check_classic(const struct sock_filter *filter, bool anc_found; int pc; - if (flen == 0 || flen > BPF_MAXINSNS) - return -EINVAL; - /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; @@ -1065,7 +1079,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL) + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); @@ -1112,7 +1126,7 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, int err; /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL) + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); @@ -1207,7 +1221,6 @@ static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); - unsigned int bpf_fsize = bpf_prog_size(fprog->len); struct bpf_prog *prog; int err; @@ -1215,10 +1228,10 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL) + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return ERR_PTR(-EINVAL); - prog = bpf_prog_alloc(bpf_fsize, 0); + prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!prog) return ERR_PTR(-ENOMEM); @@ -1288,21 +1301,10 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog; - if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); - prog = bpf_prog_get(ufd); - if (IS_ERR(prog)) - return prog; - - if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(prog); - return ERR_PTR(-EINVAL); - } - - return prog; + return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) @@ -1359,6 +1361,18 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, return err; } +static inline void bpf_push_mac_rcsum(struct sk_buff *skb) +{ + if (skb_at_tc_ingress(skb)) + skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); +} + +static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) +{ + if (skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); +} + static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; @@ -1376,12 +1390,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) ptr = skb->data + offset; if (flags & BPF_F_RECOMPUTE_CSUM) - skb_postpull_rcsum(skb, ptr, len); + __skb_postpull_rcsum(skb, ptr, len, offset); memcpy(ptr, from, len); if (flags & BPF_F_RECOMPUTE_CSUM) - skb_postpush_rcsum(skb, ptr, len); + __skb_postpush_rcsum(skb, ptr, len, offset); if (flags & BPF_F_INVALIDATE_HASH) skb_clear_hash(skb); @@ -1569,9 +1583,33 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) +{ + return dev_forward_skb(dev, skb); +} + +static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) +{ + int ret; + + if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + kfree_skb(skb); + return -ENETDOWN; + } + + skb->dev = dev; + + __this_cpu_inc(xmit_recursion); + ret = dev_queue_xmit(skb); + __this_cpu_dec(xmit_recursion); + + return ret; +} + static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { - struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; + struct sk_buff *skb = (struct sk_buff *) (long) r1; struct net_device *dev; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1581,19 +1619,14 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!dev)) return -EINVAL; - skb2 = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb2)) + skb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb)) return -ENOMEM; - if (flags & BPF_F_INGRESS) { - if (skb_at_tc_ingress(skb2)) - skb_postpush_rcsum(skb2, skb_mac_header(skb2), - skb2->mac_len); - return dev_forward_skb(dev, skb2); - } + bpf_push_mac_rcsum(skb); - skb2->dev = dev; - return dev_queue_xmit(skb2); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -1637,15 +1670,10 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - if (ri->flags & BPF_F_INGRESS) { - if (skb_at_tc_ingress(skb)) - skb_postpush_rcsum(skb, skb_mac_header(skb), - skb->mac_len); - return dev_forward_skb(dev, skb); - } + bpf_push_mac_rcsum(skb); - skb->dev = dev; - return dev_queue_xmit(skb); + return ri->flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static const struct bpf_func_proto bpf_redirect_proto = { @@ -1680,6 +1708,23 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* If skb_clear_hash() was called due to mangling, we can + * trigger SW recalculation here. Later access to hash + * can then use the inline skb->hash via context directly + * instead of calling this helper again. + */ + return skb_get_hash((struct sk_buff *) (unsigned long) r1); +} + +static const struct bpf_func_proto bpf_get_hash_recalc_proto = { + .func = bpf_get_hash_recalc, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; @@ -1690,7 +1735,10 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); + bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); + bpf_pull_mac_rcsum(skb); + bpf_compute_data_end(skb); return ret; } @@ -1710,7 +1758,10 @@ static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) struct sk_buff *skb = (struct sk_buff *) (long) r1; int ret; + bpf_push_mac_rcsum(skb); ret = skb_vlan_pop(skb); + bpf_pull_mac_rcsum(skb); + bpf_compute_data_end(skb); return ret; } @@ -1723,6 +1774,224 @@ const struct bpf_func_proto bpf_skb_vlan_pop_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); +static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) +{ + /* Caller already did skb_cow() with len as headroom, + * so no need to do it here. + */ + skb_push(skb, len); + memmove(skb->data, skb->data + len, off); + memset(skb->data + off, 0, len); + + /* No skb_postpush_rcsum(skb, skb->data + off, len) + * needed here as it does not change the skb->csum + * result for checksum complete when summing over + * zeroed blocks. + */ + return 0; +} + +static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) +{ + /* skb_ensure_writable() is not needed here, as we're + * already working on an uncloned skb. + */ + if (unlikely(!pskb_may_pull(skb, off + len))) + return -ENOMEM; + + skb_postpull_rcsum(skb, skb->data + off, len); + memmove(skb->data + len, skb->data, off); + __skb_pull(skb, len); + + return 0; +} + +static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* There's no need for __skb_push()/__skb_pull() pair to + * get to the start of the mac header as we're guaranteed + * to always start from here under eBPF. + */ + ret = bpf_skb_generic_push(skb, off, len); + if (likely(!ret)) { + skb->mac_header -= len; + skb->network_header -= len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* Same here, __skb_push()/__skb_pull() pair not needed. */ + ret = bpf_skb_generic_pop(skb, off, len); + if (likely(!ret)) { + skb->mac_header += len; + skb->network_header += len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_proto_4_to_6(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + u32 off = skb->network_header - skb->mac_header; + int ret; + + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to + * be changed into SKB_GSO_TCPV6. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + } + + /* Due to IPv6 header, MSS needs to be downgraded. */ + skb_shinfo(skb)->gso_size -= len_diff; + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + skb->protocol = htons(ETH_P_IPV6); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_6_to_4(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + u32 off = skb->network_header - skb->mac_header; + int ret; + + ret = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_pop(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to + * be changed into SKB_GSO_TCPV4. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { + skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + } + + /* Due to IPv4 header, MSS can be upgraded. */ + skb_shinfo(skb)->gso_size += len_diff; + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + skb->protocol = htons(ETH_P_IP); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) +{ + __be16 from_proto = skb->protocol; + + if (from_proto == htons(ETH_P_IP) && + to_proto == htons(ETH_P_IPV6)) + return bpf_skb_proto_4_to_6(skb); + + if (from_proto == htons(ETH_P_IPV6) && + to_proto == htons(ETH_P_IP)) + return bpf_skb_proto_6_to_4(skb); + + return -ENOTSUPP; +} + +static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __be16 proto = (__force __be16) r2; + int ret; + + if (unlikely(flags)) + return -EINVAL; + + /* General idea is that this helper does the basic groundwork + * needed for changing the protocol, and eBPF program fills the + * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() + * and other helpers, rather than passing a raw buffer here. + * + * The rationale is to keep this minimal and without a need to + * deal with raw packet data. F.e. even if we would pass buffers + * here, the program still needs to call the bpf_lX_csum_replace() + * helpers anyway. Plus, this way we keep also separation of + * concerns, since f.e. bpf_skb_store_bytes() should only take + * care of stores. + * + * Currently, additional options and extension header space are + * not supported, but flags register is reserved so we can adapt + * that. For offloads, we mark packet as dodgy, so that headers + * need to be verified first. + */ + ret = bpf_skb_proto_xlat(skb, proto); + bpf_compute_data_end(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_proto_proto = { + .func = bpf_skb_change_proto, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u32 pkt_type = r2; + + /* We only allow a restricted subset to be changed for now. */ + if (unlikely(skb->pkt_type > PACKET_OTHERHOST || + pkt_type > PACKET_OTHERHOST)) + return -EINVAL; + + skb->pkt_type = pkt_type; + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_type_proto = { + .func = bpf_skb_change_type, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push) @@ -1731,6 +2000,8 @@ bool bpf_helper_changes_skb_data(void *func) return true; if (func == bpf_skb_store_bytes) return true; + if (func == bpf_skb_change_proto) + return true; if (func == bpf_l3_csum_replace) return true; if (func == bpf_l4_csum_replace) @@ -1739,6 +2010,47 @@ bool bpf_helper_changes_skb_data(void *func) return false; } +static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, + unsigned long off, unsigned long len) +{ + void *ptr = skb_header_pointer(skb, off, len, dst_buff); + + if (unlikely(!ptr)) + return len; + if (ptr != dst_buff) + memcpy(dst_buff, ptr, len); + + return 0; +} + +static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, + u64 meta_size) +{ + struct sk_buff *skb = (struct sk_buff *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + void *meta = (void *)(long) r4; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(skb_size > skb->len)) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, + bpf_skb_copy); +} + +static const struct bpf_func_proto bpf_skb_event_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; @@ -1970,6 +2282,40 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } +#ifdef CONFIG_SOCK_CGROUP_DATA +static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(long)r1; + struct bpf_map *map = (struct bpf_map *)(long)r2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + struct sock *sk; + u32 i = (u32)r3; + + sk = skb->sk; + if (!sk || !sk_fullsock(sk)) + return -ENOENT; + + if (unlikely(i >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[i]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); +} + +static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { + .func = bpf_skb_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; +#endif + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1983,7 +2329,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: - return &bpf_get_smp_processor_id_proto; + return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: @@ -2018,6 +2364,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; + case BPF_FUNC_skb_change_proto: + return &bpf_skb_change_proto_proto; + case BPF_FUNC_skb_change_type: + return &bpf_skb_change_type_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: @@ -2030,13 +2380,27 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: - return bpf_get_event_output_proto(); + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; +#endif default: return sk_filter_func_proto(func_id); } } +static const struct bpf_func_proto * +xdp_func_proto(enum bpf_func_id func_id) +{ + return sk_filter_func_proto(func_id); +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2104,6 +2468,44 @@ static bool tc_cls_act_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool __is_valid_xdp_access(int off, int size, + enum bpf_access_type type) +{ + if (off < 0 || off >= sizeof(struct xdp_md)) + return false; + if (off % size != 0) + return false; + if (size != 4) + return false; + + return true; +} + +static bool xdp_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct xdp_md, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct xdp_md, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_xdp_access(off, size, type); +} + +void bpf_warn_invalid_xdp_action(u32 act) +{ + WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); +} +EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); + static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2255,6 +2657,29 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct xdp_md, data): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + dst_reg, src_reg, + offsetof(struct xdp_buff, data)); + break; + case offsetof(struct xdp_md, data_end): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + dst_reg, src_reg, + offsetof(struct xdp_buff, data_end)); + break; + } + + return insn - insn_buf; +} + static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -2267,6 +2692,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .convert_ctx_access = bpf_net_convert_ctx_access, }; +static const struct bpf_verifier_ops xdp_ops = { + .get_func_proto = xdp_func_proto, + .is_valid_access = xdp_is_valid_access, + .convert_ctx_access = xdp_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -2282,11 +2713,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = { .type = BPF_PROG_TYPE_SCHED_ACT, }; +static struct bpf_prog_type_list xdp_type __read_mostly = { + .ops = &xdp_ops, + .type = BPF_PROG_TYPE_XDP, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); + bpf_register_prog_type(&xdp_type); return 0; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 61ad43f61..52742a028 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -680,11 +680,13 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; + u32 hash; __flow_hash_secret_init(); - __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd), - flow_keys_have_l4(&keys)); + hash = ___skb_get_hash(skb, &keys, hashrnd); + + __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 4573d8109..cad8e791f 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -84,6 +84,7 @@ struct gen_estimator struct gnet_stats_basic_packed *bstats; struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; + seqcount_t *running; int ewma_log; u32 last_packets; unsigned long avpps; @@ -121,26 +122,28 @@ static void est_timer(unsigned long arg) unsigned long rate; u64 brate; - spin_lock(e->stats_lock); + if (e->stats_lock) + spin_lock(e->stats_lock); read_lock(&est_lock); if (e->bstats == NULL) goto skip; - __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats); + __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats); brate = (b.bytes - e->last_bytes)<<(7 - idx); e->last_bytes = b.bytes; e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); - e->rate_est->bps = (e->avbps+0xF)>>5; + WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5); rate = b.packets - e->last_packets; rate <<= (7 - idx); e->last_packets = b.packets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); - e->rate_est->pps = (e->avpps + 0xF) >> 5; + WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5); skip: read_unlock(&est_lock); - spin_unlock(e->stats_lock); + if (e->stats_lock) + spin_unlock(e->stats_lock); } if (!list_empty(&elist[idx].list)) @@ -194,6 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock + * @running: qdisc running seqcount * @opt: rate estimator configuration TLV * * Creates a new rate estimator with &bstats as source and &rate_est @@ -209,6 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, + seqcount_t *running, struct nlattr *opt) { struct gen_estimator *est; @@ -226,12 +231,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (est == NULL) return -ENOBUFS; - __gnet_stats_copy_basic(&b, cpu_bstats, bstats); + __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats); idx = parm->interval + 2; est->bstats = bstats; est->rate_est = rate_est; est->stats_lock = stats_lock; + est->running = running; est->ewma_log = parm->ewma_log; est->last_bytes = b.bytes; est->avbps = rate_est->bps<<5; @@ -291,6 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock + * @running: qdisc running seqcount (might be NULL) * @opt: rate estimator configuration TLV * * Replaces the configuration of a rate estimator by calling @@ -301,10 +308,11 @@ EXPORT_SYMBOL(gen_kill_estimator); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, - spinlock_t *stats_lock, struct nlattr *opt) + spinlock_t *stats_lock, + seqcount_t *running, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index be873e4e3..508e05130 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -32,10 +32,11 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) return 0; nla_put_failure: + if (d->lock) + spin_unlock_bh(d->lock); kfree(d->xstats); d->xstats = NULL; d->xstats_len = 0; - spin_unlock_bh(d->lock); return -1; } @@ -66,15 +67,16 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, { memset(d, 0, sizeof(*d)); - spin_lock_bh(lock); - d->lock = lock; if (type) d->tail = (struct nlattr *)skb_tail_pointer(skb); d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; d->padattr = padattr; - + if (lock) { + d->lock = lock; + spin_lock_bh(lock); + } if (d->tail) return gnet_stats_copy(d, type, NULL, 0, padattr); @@ -128,21 +130,29 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, } void -__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, +__gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b) { + unsigned int seq; + if (cpu) { __gnet_stats_copy_basic_cpu(bstats, cpu); - } else { + return; + } + do { + if (running) + seq = read_seqcount_begin(running); bstats->bytes = b->bytes; bstats->packets = b->packets; - } + } while (running && read_seqcount_retry(running, seq)); } EXPORT_SYMBOL(__gnet_stats_copy_basic); /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics @@ -154,13 +164,14 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(struct gnet_dump *d, +gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_dump *d, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b) { struct gnet_stats_basic_packed bstats = {0}; - __gnet_stats_copy_basic(&bstats, cpu, b); + __gnet_stats_copy_basic(running, &bstats, cpu, b); if (d->compat_tc_stats) { d->tc_stats.bytes = bstats.bytes; @@ -330,8 +341,9 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) return 0; err_out: + if (d->lock) + spin_unlock_bh(d->lock); d->xstats_len = 0; - spin_unlock_bh(d->lock); return -1; } EXPORT_SYMBOL(gnet_stats_copy_app); @@ -365,10 +377,11 @@ gnet_stats_finish_copy(struct gnet_dump *d) return -1; } + if (d->lock) + spin_unlock_bh(d->lock); kfree(d->xstats); d->xstats = NULL; d->xstats_len = 0; - spin_unlock_bh(d->lock); return 0; } EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 510cd62fc..cf26e04c4 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -473,7 +473,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, } if (dev->netdev_ops->ndo_neigh_construct) { - error = dev->netdev_ops->ndo_neigh_construct(n); + error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; @@ -701,7 +701,7 @@ void neigh_destroy(struct neighbour *neigh) neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) - dev->netdev_ops->ndo_neigh_destroy(neigh); + dev->netdev_ops->ndo_neigh_destroy(dev, neigh); dev_put(dev); neigh_parms_put(neigh->parms); @@ -1060,8 +1060,6 @@ static void neigh_update_hhs(struct neighbour *neigh) NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. - It also allows to retain current state - if lladdr is unchanged. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing @@ -1150,10 +1148,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } else goto out; } else { - if (lladdr == neigh->ha && new == NUD_STALE && - ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) || - (old & NUD_CONNECTED)) - ) + if (lladdr == neigh->ha && new == NUD_STALE) new = old; } } @@ -2047,6 +2042,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, @@ -2930,6 +2926,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) return; set_bit(index, p->data_state); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7a0b61655..6e4f34721 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex); static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) { - dev->tx_queue_len = new_len; + int res, orig_len = dev->tx_queue_len; + + if (new_len != orig_len) { + dev->tx_queue_len = new_len; + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); + res = notifier_to_errno(res); + if (res) { + netdev_err(dev, + "refused to change device tx_queue_len\n"); + dev->tx_queue_len = orig_len; + return -EFAULT; + } + } + return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 94acfc89a..53599bd0c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -163,7 +163,7 @@ static void poll_one_napi(struct napi_struct *napi) */ work = napi->poll(napi, 0); WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll); - trace_napi_poll(napi); + trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8b02df0d3..bbd118b19 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -213,6 +213,7 @@ /* Xmit modes */ #define M_START_XMIT 0 /* Default normal TX */ #define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); @@ -626,6 +627,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) seq_puts(seq, " xmit_mode: netif_receive\n"); + else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) + seq_puts(seq, " xmit_mode: xmit_queue\n"); seq_puts(seq, " Flags: "); @@ -1142,8 +1145,10 @@ static ssize_t pktgen_if_write(struct file *file, return len; i += len; - if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) && - (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + if ((value > 1) && + ((pkt_dev->xmit_mode == M_QUEUE_XMIT) || + ((pkt_dev->xmit_mode == M_START_XMIT) && + (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))))) return -ENOTSUPP; pkt_dev->burst = value < 1 ? 1 : value; sprintf(pg_result, "OK: burst=%d", pkt_dev->burst); @@ -1198,6 +1203,9 @@ static ssize_t pktgen_if_write(struct file *file, * at module loading time */ pkt_dev->clone_skb = 0; + } else if (strcmp(f, "queue_xmit") == 0) { + pkt_dev->xmit_mode = M_QUEUE_XMIT; + pkt_dev->last_ok = 1; } else { sprintf(pg_result, "xmit_mode -:%s:- unknown\nAvailable modes: %s", @@ -3434,6 +3442,36 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) #endif } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ + } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { + local_bh_disable(); + atomic_inc(&pkt_dev->skb->users); + + ret = dev_queue_xmit(pkt_dev->skb); + switch (ret) { + case NET_XMIT_SUCCESS: + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + break; + case NET_XMIT_DROP: + case NET_XMIT_CN: + /* These are all valid return codes for a qdisc but + * indicate packets are being dropped or will likely + * be dropped soon. + */ + case NETDEV_TX_BUSY: + /* qdisc may call dev_hard_start_xmit directly in cases + * where no queues exist e.g. loopback device, virtual + * devices, etc. In this case we need to handle + * NETDEV_TX_ codes. + */ + default: + pkt_dev->errors++; + net_info_ratelimited("%s xmit error: %d\n", + pkt_dev->odevname, ret); + break; + } + goto out; } txq = skb_get_tx_queue(odev, pkt_dev->skb); @@ -3463,7 +3501,6 @@ xmit_more: break; case NET_XMIT_DROP: case NET_XMIT_CN: - case NET_XMIT_POLICED: /* skb has been consumed */ pkt_dev->errors++; break; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d69c4644f..189cc78c7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -71,9 +71,31 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +static struct sk_buff *defer_kfree_skb_list; +void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) +{ + if (head && tail) { + tail->next = defer_kfree_skb_list; + defer_kfree_skb_list = head; + } +} +EXPORT_SYMBOL(rtnl_kfree_skbs); + void __rtnl_unlock(void) { + struct sk_buff *head = defer_kfree_skb_list; + + defer_kfree_skb_list = NULL; + mutex_unlock(&rtnl_mutex); + + while (head) { + struct sk_buff *next = head->next; + + kfree_skb(head); + cond_resched(); + head = next; + } } void rtnl_unlock(void) @@ -869,6 +891,16 @@ static size_t rtnl_port_size(const struct net_device *dev, return port_self_size; } +static size_t rtnl_xdp_size(const struct net_device *dev) +{ + size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */ + + if (!dev->netdev_ops->ndo_xdp) + return 0; + else + return xdp_size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -905,6 +937,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + + rtnl_xdp_size(dev) /* IFLA_XDP */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1189,6 +1222,33 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct netdev_xdp xdp_op = {}; + struct nlattr *xdp; + int err; + + if (!dev->netdev_ops->ndo_xdp) + return 0; + xdp = nla_nest_start(skb, IFLA_XDP); + if (!xdp) + return -EMSGSIZE; + xdp_op.command = XDP_QUERY_PROG; + err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); + if (err) + goto err_cancel; + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached); + if (err) + goto err_cancel; + + nla_nest_end(skb, xdp); + return 0; + +err_cancel: + nla_nest_cancel(skb, xdp); + return err; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -1285,6 +1345,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; + if (rtnl_xdp_fill(skb, dev)) + goto nla_put_failure; + if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; @@ -1370,6 +1433,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, + [IFLA_XDP] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1407,6 +1471,11 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, }; +static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { + [IFLA_XDP_FD] = { .type = NLA_S32 }, + [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, +}; + static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) { const struct rtnl_link_ops *ops = NULL; @@ -1905,11 +1974,19 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_TXQLEN]) { unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); - - if (dev->tx_queue_len ^ value) + unsigned long orig_len = dev->tx_queue_len; + + if (dev->tx_queue_len ^ value) { + dev->tx_queue_len = value; + err = call_netdevice_notifiers( + NETDEV_CHANGE_TX_QUEUE_LEN, dev); + err = notifier_to_errno(err); + if (err) { + dev->tx_queue_len = orig_len; + goto errout; + } status |= DO_SETLINK_NOTIFY; - - dev->tx_queue_len = value; + } } if (tb[IFLA_OPERSTATE]) @@ -2024,6 +2101,27 @@ static int do_setlink(const struct sk_buff *skb, status |= DO_SETLINK_NOTIFY; } + if (tb[IFLA_XDP]) { + struct nlattr *xdp[IFLA_XDP_MAX + 1]; + + err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], + ifla_xdp_policy); + if (err < 0) + goto errout; + + if (xdp[IFLA_XDP_ATTACHED]) { + err = -EINVAL; + goto errout; + } + if (xdp[IFLA_XDP_FD]) { + err = dev_change_xdp_fd(dev, + nla_get_s32(xdp[IFLA_XDP_FD])); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + } + errout: if (status & DO_SETLINK_MODIFIED) { if (status & DO_SETLINK_NOTIFY) @@ -3497,7 +3595,32 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (!attr) goto nla_put_failure; - err = ops->fill_linkxstats(skb, dev, prividx); + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, + *idxattr)) { + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + master = netdev_master_upper_dev_get(dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->fill_linkxstats) { + int err; + + *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; + attr = nla_nest_start(skb, + IFLA_STATS_LINK_XSTATS_SLAVE); + if (!attr) + goto nla_put_failure; + + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; @@ -3533,14 +3656,35 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + int attr = IFLA_STATS_LINK_XSTATS; if (ops && ops->get_linkxstats_size) { - size += nla_total_size(ops->get_linkxstats_size(dev)); + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); /* for IFLA_STATS_LINK_XSTATS */ size += nla_total_size(0); } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { + struct net_device *_dev = (struct net_device *)dev; + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + /* netdev_master_upper_dev_get can't take const */ + master = netdev_master_upper_dev_get(_dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->get_linkxstats_size) { + int attr = IFLA_STATS_LINK_XSTATS_SLAVE; + + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); + /* for IFLA_STATS_LINK_XSTATS_SLAVE */ + size += nla_total_size(0); + } + } + return size; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index eb12d2161..3864b4b68 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #ifdef CONFIG_NET_CLS_ACT #include @@ -3098,9 +3099,13 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int hsize; int size; - len = head_skb->len - offset; - if (len > mss) - len = mss; + if (unlikely(mss == GSO_BY_FRAGS)) { + len = list_skb->len; + } else { + len = head_skb->len - offset; + if (len > mss) + len = mss; + } hsize = skb_headlen(head_skb) - offset; if (hsize < 0) @@ -3420,6 +3425,7 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } +EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { @@ -4360,6 +4366,8 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) thlen += inner_tcp_hdrlen(skb); } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { thlen = tcp_hdrlen(skb); + } else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) { + thlen = sizeof(struct sctphdr); } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already @@ -4369,6 +4377,38 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) } EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); +/** + * skb_gso_validate_mtu - Return in case such skb fits a given MTU + * + * @skb: GSO skb + * @mtu: MTU to validate against + * + * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU + * once split. + */ +bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const struct sk_buff *iter; + unsigned int hlen; + + hlen = skb_gso_network_seglen(skb); + + if (shinfo->gso_size != GSO_BY_FRAGS) + return hlen <= mtu; + + /* Undo this so we can re-use header sizes */ + hlen -= GSO_BY_FRAGS; + + skb_walk_frags(skb, iter) { + if (hlen + skb_headlen(iter) > mtu) + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mtu); + static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { if (skb_cow(skb, skb_headroom(skb)) < 0) { diff --git a/net/core/utils.c b/net/core/utils.c index 3d17ca8b4..cf5622b9c 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -133,7 +133,7 @@ int in4_pton(const char *src, int srclen, s = src; d = dbuf; i = 0; - while(1) { + while (1) { int c; c = xdigit2bin(srclen > 0 ? *s : '\0', delim); if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) { @@ -283,11 +283,11 @@ cont: i = 15; d--; if (dc) { - while(d >= dc) + while (d >= dc) dst[i--] = *d--; - while(i >= dc - dbuf) + while (i >= dc - dbuf) dst[i--] = 0; - while(i >= 0) + while (i >= 0) dst[i--] = *d--; } else memcpy(dst, dbuf, sizeof(dbuf)); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 3ff137d94..3828f94b2 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -216,14 +216,17 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req skb = dccp_make_response(sk, dst, req); if (skb != NULL) { struct dccp_hdr *dh = dccp_hdr(skb); + struct ipv6_txoptions *opt; dh->dccph_checksum = dccp_v6_csum_finish(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; rcu_read_lock(); - err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), - np->tclass); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); + err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -236,6 +239,7 @@ done: static void dccp_v6_reqsk_destructor(struct request_sock *req) { dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); + kfree(inet_rsk(req)->ipv6_opt); kfree_skb(inet_rsk(req)->pktopts); } @@ -494,7 +498,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * Yes, keeping reference count would be much more clever, but we make * one more one thing there: reattach optmem to newsk. */ - opt = rcu_dereference(np->opt); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); diff --git a/net/dsa/Makefile b/net/dsa/Makefile index da06ed1df..8af4ded70 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o slave.o +dsa_core-y += dsa.o slave.o dsa2.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index eff5dfc2e..7e68bc6bc 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -29,6 +29,33 @@ char dsa_driver_version[] = "0.1"; +static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Just return the original SKB */ + return skb; +} + +static const struct dsa_device_ops none_ops = { + .xmit = dsa_slave_notag_xmit, + .rcv = NULL, +}; + +const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { +#ifdef CONFIG_NET_DSA_TAG_DSA + [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#endif + [DSA_TAG_PROTO_NONE] = &none_ops, +}; /* switch driver registration ***********************************************/ static DEFINE_MUTEX(dsa_switch_drivers_mutex); @@ -180,41 +207,100 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); #endif /* CONFIG_NET_DSA_HWMON */ /* basic switch operations **************************************************/ -static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) +int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, + struct device_node *port_dn, int port) { - struct dsa_chip_data *cd = ds->cd; - struct device_node *port_dn; struct phy_device *phydev; - int ret, port, mode; + int ret, mode; + + if (of_phy_is_fixed_link(port_dn)) { + ret = of_phy_register_fixed_link(port_dn); + if (ret) { + dev_err(dev, "failed to register fixed PHY\n"); + return ret; + } + phydev = of_phy_find_device(port_dn); + + mode = of_get_phy_mode(port_dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + phydev->interface = mode; + + genphy_config_init(phydev); + genphy_read_status(phydev); + if (ds->drv->adjust_link) + ds->drv->adjust_link(ds, port, phydev); + } + + return 0; +} + +static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev) +{ + struct device_node *port_dn; + int ret, port; for (port = 0; port < DSA_MAX_PORTS; port++) { if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; - port_dn = cd->port_dn[port]; - if (of_phy_is_fixed_link(port_dn)) { - ret = of_phy_register_fixed_link(port_dn); - if (ret) { - netdev_err(master, - "failed to register fixed PHY\n"); - return ret; - } - phydev = of_phy_find_device(port_dn); + port_dn = ds->ports[port].dn; + ret = dsa_cpu_dsa_setup(ds, dev, port_dn, port); + if (ret) + return ret; + } + return 0; +} - mode = of_get_phy_mode(port_dn); - if (mode < 0) - mode = PHY_INTERFACE_MODE_NA; - phydev->interface = mode; +const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) +{ + const struct dsa_device_ops *ops; + + if (tag_protocol >= DSA_TAG_LAST) + return ERR_PTR(-EINVAL); + ops = dsa_device_ops[tag_protocol]; + + if (!ops) + return ERR_PTR(-ENOPROTOOPT); + + return ops; +} + +int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds) +{ + struct net_device *master; + struct ethtool_ops *cpu_ops; + + master = ds->dst->master_netdev; + if (ds->master_netdev) + master = ds->master_netdev; + + cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL); + if (!cpu_ops) + return -ENOMEM; + + memcpy(&ds->dst->master_ethtool_ops, master->ethtool_ops, + sizeof(struct ethtool_ops)); + ds->dst->master_orig_ethtool_ops = master->ethtool_ops; + memcpy(cpu_ops, &ds->dst->master_ethtool_ops, + sizeof(struct ethtool_ops)); + dsa_cpu_port_ethtool_init(cpu_ops); + master->ethtool_ops = cpu_ops; - genphy_config_init(phydev); - genphy_read_status(phydev); - if (ds->drv->adjust_link) - ds->drv->adjust_link(ds, port, phydev); - } - } return 0; } +void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) +{ + struct net_device *master; + + master = ds->dst->master_netdev; + if (ds->master_netdev) + master = ds->master_netdev; + + master->ethtool_ops = ds->dst->master_orig_ethtool_ops; +} + static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; @@ -243,6 +329,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) } dst->cpu_switch = index; dst->cpu_port = i; + ds->cpu_port_mask |= 1 << i; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; } else { @@ -267,37 +354,17 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * switch. */ if (dst->cpu_switch == index) { - switch (drv->tag_protocol) { -#ifdef CONFIG_NET_DSA_TAG_DSA - case DSA_TAG_PROTO_DSA: - dst->rcv = dsa_netdev_ops.rcv; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - case DSA_TAG_PROTO_EDSA: - dst->rcv = edsa_netdev_ops.rcv; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - case DSA_TAG_PROTO_TRAILER: - dst->rcv = trailer_netdev_ops.rcv; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_BRCM - case DSA_TAG_PROTO_BRCM: - dst->rcv = brcm_netdev_ops.rcv; - break; -#endif - case DSA_TAG_PROTO_NONE: - break; - default: - ret = -ENOPROTOOPT; + dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol); + if (IS_ERR(dst->tag_ops)) { + ret = PTR_ERR(dst->tag_ops); goto out; } - dst->tag_protocol = drv->tag_protocol; + dst->rcv = dst->tag_ops->rcv; } + memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); + /* * Do basic register setup. */ @@ -309,22 +376,25 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) if (ret < 0) goto out; - ds->slave_mii_bus = devm_mdiobus_alloc(parent); - if (ds->slave_mii_bus == NULL) { - ret = -ENOMEM; - goto out; - } - dsa_slave_mii_bus_init(ds); - - ret = mdiobus_register(ds->slave_mii_bus); - if (ret < 0) - goto out; + if (!ds->slave_mii_bus && drv->phy_read) { + ds->slave_mii_bus = devm_mdiobus_alloc(parent); + if (!ds->slave_mii_bus) { + ret = -ENOMEM; + goto out; + } + dsa_slave_mii_bus_init(ds); + ret = mdiobus_register(ds->slave_mii_bus); + if (ret < 0) + goto out; + } /* * Create network devices for physical switch ports. */ for (i = 0; i < DSA_MAX_PORTS; i++) { + ds->ports[i].dn = cd->port_dn[i]; + if (!(ds->enabled_port_mask & (1 << i))) continue; @@ -337,13 +407,17 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) } /* Perform configuration of the CPU and DSA ports */ - ret = dsa_cpu_dsa_setup(ds, dst->master_netdev); + ret = dsa_cpu_dsa_setups(ds, parent); if (ret < 0) { netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", index); ret = 0; } + ret = dsa_cpu_port_ethtool_setup(ds); + if (ret) + return ret; + #ifdef CONFIG_NET_DSA_HWMON /* If the switch provides a temperature sensor, * register with hardware monitoring subsystem. @@ -420,11 +494,21 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, return ds; } -static void dsa_switch_destroy(struct dsa_switch *ds) +void dsa_cpu_dsa_destroy(struct device_node *port_dn) { - struct device_node *port_dn; struct phy_device *phydev; - struct dsa_chip_data *cd = ds->cd; + + if (of_phy_is_fixed_link(port_dn)) { + phydev = of_phy_find_device(port_dn); + if (phydev) { + phy_device_free(phydev); + fixed_phy_unregister(phydev); + } + } +} + +static void dsa_switch_destroy(struct dsa_switch *ds) +{ int port; #ifdef CONFIG_NET_DSA_HWMON @@ -437,26 +521,25 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!(ds->enabled_port_mask & (1 << port))) continue; - if (!ds->ports[port]) + if (!ds->ports[port].netdev) continue; - dsa_slave_destroy(ds->ports[port]); + dsa_slave_destroy(ds->ports[port].netdev); } - /* Remove any fixed link PHYs */ + /* Disable configuration of the CPU and DSA ports */ for (port = 0; port < DSA_MAX_PORTS; port++) { - port_dn = cd->port_dn[port]; - if (of_phy_is_fixed_link(port_dn)) { - phydev = of_phy_find_device(port_dn); - if (phydev) { - phy_device_free(phydev); - of_node_put(port_dn); - fixed_phy_unregister(phydev); - } - } + if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + continue; + dsa_cpu_dsa_destroy(ds->ports[port].dn); + + /* Clearing a bit which is not set does no harm */ + ds->cpu_port_mask |= ~(1 << port); + ds->dsa_port_mask |= ~(1 << port); } - mdiobus_unregister(ds->slave_mii_bus); + if (ds->slave_mii_bus && ds->drv->phy_read) + mdiobus_unregister(ds->slave_mii_bus); } #ifdef CONFIG_PM_SLEEP @@ -469,7 +552,7 @@ static int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i]); + ret = dsa_slave_suspend(ds->ports[i].netdev); if (ret) return ret; } @@ -495,7 +578,7 @@ static int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i]); + ret = dsa_slave_resume(ds->ports[i].netdev); if (ret) return ret; } @@ -587,17 +670,6 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, if (link_sw_addr >= pd->nr_chips) return -EINVAL; - /* First time routing table allocation */ - if (!cd->rtable) { - cd->rtable = kmalloc_array(pd->nr_chips, sizeof(s8), - GFP_KERNEL); - if (!cd->rtable) - return -ENOMEM; - - /* default to no valid uplink/downlink */ - memset(cd->rtable, -1, pd->nr_chips * sizeof(s8)); - } - cd->rtable[link_sw_addr] = port_index; return 0; @@ -639,7 +711,6 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) kfree(pd->chip[i].port_names[port_index]); port_index++; } - kfree(pd->chip[i].rtable); /* Drop our reference to the MDIO bus device */ if (pd->chip[i].host_dev) @@ -703,11 +774,17 @@ static int dsa_of_probe(struct device *dev) chip_index = -1; for_each_available_child_of_node(np, child) { + int i; + chip_index++; cd = &pd->chip[chip_index]; cd->of_node = child; + /* Initialize the routing table */ + for (i = 0; i < DSA_MAX_SWITCHES; ++i) + cd->rtable[i] = DSA_RTABLE_NONE; + /* When assigning the host device, increment its refcount */ cd->host_dev = get_device(&mdio_bus->dev); @@ -931,6 +1008,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } + dsa_cpu_port_ethtool_restore(dst->ds[0]); + dev_put(dst->master_netdev); } diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c new file mode 100644 index 000000000..f30bad967 --- /dev/null +++ b/net/dsa/dsa2.c @@ -0,0 +1,695 @@ +/* + * net/dsa/dsa2.c - Hardware switch handling, binding version 2 + * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2013 Florian Fainelli + * Copyright (c) 2016 Andrew Lunn + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dsa_priv.h" + +static LIST_HEAD(dsa_switch_trees); +static DEFINE_MUTEX(dsa2_mutex); + +static struct dsa_switch_tree *dsa_get_dst(u32 tree) +{ + struct dsa_switch_tree *dst; + + list_for_each_entry(dst, &dsa_switch_trees, list) + if (dst->tree == tree) + return dst; + return NULL; +} + +static void dsa_free_dst(struct kref *ref) +{ + struct dsa_switch_tree *dst = container_of(ref, struct dsa_switch_tree, + refcount); + + list_del(&dst->list); + kfree(dst); +} + +static void dsa_put_dst(struct dsa_switch_tree *dst) +{ + kref_put(&dst->refcount, dsa_free_dst); +} + +static struct dsa_switch_tree *dsa_add_dst(u32 tree) +{ + struct dsa_switch_tree *dst; + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return NULL; + dst->tree = tree; + dst->cpu_switch = -1; + INIT_LIST_HEAD(&dst->list); + list_add_tail(&dsa_switch_trees, &dst->list); + kref_init(&dst->refcount); + + return dst; +} + +static void dsa_dst_add_ds(struct dsa_switch_tree *dst, + struct dsa_switch *ds, u32 index) +{ + kref_get(&dst->refcount); + dst->ds[index] = ds; +} + +static void dsa_dst_del_ds(struct dsa_switch_tree *dst, + struct dsa_switch *ds, u32 index) +{ + dst->ds[index] = NULL; + kref_put(&dst->refcount, dsa_free_dst); +} + +static bool dsa_port_is_dsa(struct device_node *port) +{ + const char *name; + + name = of_get_property(port, "label", NULL); + if (!name) + return false; + + if (!strcmp(name, "dsa")) + return true; + + return false; +} + +static bool dsa_port_is_cpu(struct device_node *port) +{ + const char *name; + + name = of_get_property(port, "label", NULL); + if (!name) + return false; + + if (!strcmp(name, "cpu")) + return true; + + return false; +} + +static bool dsa_ds_find_port(struct dsa_switch *ds, + struct device_node *port) +{ + u32 index; + + for (index = 0; index < DSA_MAX_PORTS; index++) + if (ds->ports[index].dn == port) + return true; + return false; +} + +static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst, + struct device_node *port) +{ + struct dsa_switch *ds; + u32 index; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + if (dsa_ds_find_port(ds, port)) + return ds; + } + + return NULL; +} + +static int dsa_port_complete(struct dsa_switch_tree *dst, + struct dsa_switch *src_ds, + struct device_node *port, + u32 src_port) +{ + struct device_node *link; + int index; + struct dsa_switch *dst_ds; + + for (index = 0;; index++) { + link = of_parse_phandle(port, "link", index); + if (!link) + break; + + dst_ds = dsa_dst_find_port(dst, link); + of_node_put(link); + + if (!dst_ds) + return 1; + + src_ds->rtable[dst_ds->index] = src_port; + } + + return 0; +} + +/* A switch is complete if all the DSA ports phandles point to ports + * known in the tree. A return value of 1 means the tree is not + * complete. This is not an error condition. A value of 0 is + * success. + */ +static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (!dsa_port_is_dsa(port)) + continue; + + err = dsa_port_complete(dst, ds, port, index); + if (err != 0) + return err; + + ds->dsa_port_mask |= BIT(index); + } + + return 0; +} + +/* A tree is complete if all the DSA ports phandles point to ports + * known in the tree. A return value of 1 means the tree is not + * complete. This is not an error condition. A value of 0 is + * success. + */ +static int dsa_dst_complete(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + err = dsa_ds_complete(dst, ds); + if (err != 0) + return err; + } + + return 0; +} + +static int dsa_dsa_port_apply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + int err; + + err = dsa_cpu_dsa_setup(ds, ds->dev, port, index); + if (err) { + dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n", + index, err); + return err; + } + + return 0; +} + +static void dsa_dsa_port_unapply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + dsa_cpu_dsa_destroy(port); +} + +static int dsa_cpu_port_apply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + int err; + + err = dsa_cpu_dsa_setup(ds, ds->dev, port, index); + if (err) { + dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n", + index, err); + return err; + } + + ds->cpu_port_mask |= BIT(index); + + return 0; +} + +static void dsa_cpu_port_unapply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + dsa_cpu_dsa_destroy(port); + ds->cpu_port_mask &= ~BIT(index); + +} + +static int dsa_user_port_apply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + const char *name; + int err; + + name = of_get_property(port, "label", NULL); + + err = dsa_slave_create(ds, ds->dev, index, name); + if (err) { + dev_warn(ds->dev, "Failed to create slave %d: %d\n", + index, err); + return err; + } + + return 0; +} + +static void dsa_user_port_unapply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + if (ds->ports[index].netdev) { + dsa_slave_destroy(ds->ports[index].netdev); + ds->ports[index].netdev = NULL; + ds->enabled_port_mask &= ~(1 << index); + } +} + +static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + int err; + + /* Initialize ds->phys_mii_mask before registering the slave MDIO bus + * driver and before drv->setup() has run, since the switch drivers and + * the slave MDIO bus driver rely on these values for probing PHY + * devices or not + */ + ds->phys_mii_mask = ds->enabled_port_mask; + + err = ds->drv->setup(ds); + if (err < 0) + return err; + + err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + if (err < 0) + return err; + + err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + if (err < 0) + return err; + + if (!ds->slave_mii_bus && ds->drv->phy_read) { + ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); + if (!ds->slave_mii_bus) + return -ENOMEM; + + dsa_slave_mii_bus_init(ds); + + err = mdiobus_register(ds->slave_mii_bus); + if (err < 0) + return err; + } + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (dsa_port_is_dsa(port)) { + err = dsa_dsa_port_apply(port, index, ds); + if (err) + return err; + continue; + } + + if (dsa_port_is_cpu(port)) { + err = dsa_cpu_port_apply(port, index, ds); + if (err) + return err; + continue; + } + + err = dsa_user_port_apply(port, index, ds); + if (err) + continue; + } + + return 0; +} + +static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (dsa_port_is_dsa(port)) { + dsa_dsa_port_unapply(port, index, ds); + continue; + } + + if (dsa_port_is_cpu(port)) { + dsa_cpu_port_unapply(port, index, ds); + continue; + } + + dsa_user_port_unapply(port, index, ds); + } + + if (ds->slave_mii_bus && ds->drv->phy_read) + mdiobus_unregister(ds->slave_mii_bus); +} + +static int dsa_dst_apply(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + err = dsa_ds_apply(dst, ds); + if (err) + return err; + } + + err = dsa_cpu_port_ethtool_setup(dst->ds[0]); + if (err) + return err; + + /* If we use a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point on get + * sent to the tag format's receive function. + */ + wmb(); + dst->master_netdev->dsa_ptr = (void *)dst; + dst->applied = true; + + return 0; +} + +static void dsa_dst_unapply(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + + if (!dst->applied) + return; + + dst->master_netdev->dsa_ptr = NULL; + + /* If we used a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point get sent + * without the tag and go through the regular receive path. + */ + wmb(); + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + dsa_ds_unapply(dst, ds); + } + + dsa_cpu_port_ethtool_restore(dst->ds[0]); + + pr_info("DSA: tree %d unapplied\n", dst->tree); + dst->applied = false; +} + +static int dsa_cpu_parse(struct device_node *port, u32 index, + struct dsa_switch_tree *dst, + struct dsa_switch *ds) +{ + struct net_device *ethernet_dev; + struct device_node *ethernet; + + ethernet = of_parse_phandle(port, "ethernet", 0); + if (!ethernet) + return -EINVAL; + + ethernet_dev = of_find_net_device_by_node(ethernet); + if (!ethernet_dev) + return -EPROBE_DEFER; + + if (!ds->master_netdev) + ds->master_netdev = ethernet_dev; + + if (!dst->master_netdev) + dst->master_netdev = ethernet_dev; + + if (dst->cpu_switch == -1) { + dst->cpu_switch = ds->index; + dst->cpu_port = index; + } + + dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol); + if (IS_ERR(dst->tag_ops)) { + dev_warn(ds->dev, "No tagger for this switch\n"); + return PTR_ERR(dst->tag_ops); + } + + dst->rcv = dst->tag_ops->rcv; + + return 0; +} + +static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (dsa_port_is_cpu(port)) { + err = dsa_cpu_parse(port, index, dst, ds); + if (err) + return err; + } + } + + pr_info("DSA: switch %d %d parsed\n", dst->tree, ds->index); + + return 0; +} + +static int dsa_dst_parse(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + err = dsa_ds_parse(dst, ds); + if (err) + return err; + } + + if (!dst->master_netdev) { + pr_warn("Tree has no master device\n"); + return -EINVAL; + } + + pr_info("DSA: tree %d parsed\n", dst->tree); + + return 0; +} + +static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) +{ + struct device_node *port; + int err; + u32 reg; + + for_each_available_child_of_node(ports, port) { + err = of_property_read_u32(port, "reg", ®); + if (err) + return err; + + if (reg >= DSA_MAX_PORTS) + return -EINVAL; + + ds->ports[reg].dn = port; + + /* Initialize enabled_port_mask now for drv->setup() + * to have access to a correct value, just like what + * net/dsa/dsa.c::dsa_switch_setup_one does. + */ + if (!dsa_port_is_cpu(port)) + ds->enabled_port_mask |= 1 << reg; + } + + return 0; +} + +static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index) +{ + int err; + + *tree = *index = 0; + + err = of_property_read_u32_index(np, "dsa,member", 0, tree); + if (err) { + /* Does not exist, but it is optional */ + if (err == -EINVAL) + return 0; + return err; + } + + err = of_property_read_u32_index(np, "dsa,member", 1, index); + if (err) + return err; + + if (*index >= DSA_MAX_SWITCHES) + return -EINVAL; + + return 0; +} + +static struct device_node *dsa_get_ports(struct dsa_switch *ds, + struct device_node *np) +{ + struct device_node *ports; + + ports = of_get_child_by_name(np, "ports"); + if (!ports) { + dev_err(ds->dev, "no ports child node found\n"); + return ERR_PTR(-EINVAL); + } + + return ports; +} + +static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) +{ + struct device_node *ports = dsa_get_ports(ds, np); + struct dsa_switch_tree *dst; + u32 tree, index; + int i, err; + + err = dsa_parse_member(np, &tree, &index); + if (err) + return err; + + if (IS_ERR(ports)) + return PTR_ERR(ports); + + err = dsa_parse_ports_dn(ports, ds); + if (err) + return err; + + dst = dsa_get_dst(tree); + if (!dst) { + dst = dsa_add_dst(tree); + if (!dst) + return -ENOMEM; + } + + if (dst->ds[index]) { + err = -EBUSY; + goto out; + } + + ds->dst = dst; + ds->index = index; + + /* Initialize the routing table */ + for (i = 0; i < DSA_MAX_SWITCHES; ++i) + ds->rtable[i] = DSA_RTABLE_NONE; + + dsa_dst_add_ds(dst, ds, index); + + err = dsa_dst_complete(dst); + if (err < 0) + goto out_del_dst; + + if (err == 1) { + /* Not all switches registered yet */ + err = 0; + goto out; + } + + if (dst->applied) { + pr_info("DSA: Disjoint trees?\n"); + return -EINVAL; + } + + err = dsa_dst_parse(dst); + if (err) + goto out_del_dst; + + err = dsa_dst_apply(dst); + if (err) { + dsa_dst_unapply(dst); + goto out_del_dst; + } + + dsa_put_dst(dst); + return 0; + +out_del_dst: + dsa_dst_del_ds(dst, ds, ds->index); +out: + dsa_put_dst(dst); + + return err; +} + +int dsa_register_switch(struct dsa_switch *ds, struct device_node *np) +{ + int err; + + mutex_lock(&dsa2_mutex); + err = _dsa_register_switch(ds, np); + mutex_unlock(&dsa2_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(dsa_register_switch); + +static void _dsa_unregister_switch(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + + dsa_dst_unapply(dst); + + dsa_dst_del_ds(dst, ds, ds->index); +} + +void dsa_unregister_switch(struct dsa_switch *ds) +{ + mutex_lock(&dsa2_mutex); + _dsa_unregister_switch(ds); + mutex_unlock(&dsa2_mutex); +} +EXPORT_SYMBOL_GPL(dsa_unregister_switch); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index dfa33779d..00077a9c9 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -50,12 +50,19 @@ struct dsa_slave_priv { /* dsa.c */ extern char dsa_driver_version[]; +int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, + struct device_node *port_dn, int port); +void dsa_cpu_dsa_destroy(struct device_node *port_dn); +const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); +int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds); +void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); +void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops); int dsa_slave_create(struct dsa_switch *ds, struct device *parent, - int port, char *name); + int port, const char *name); void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 152436cda..fc9196745 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -49,8 +49,8 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->name = "dsa slave smi"; ds->slave_mii_bus->read = dsa_slave_phy_read; ds->slave_mii_bus->write = dsa_slave_phy_write; - snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x", - ds->index, ds->cd->sw_addr); + snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", + ds->dst->tree, ds->index); ds->slave_mii_bus->parent = ds->dev; ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; } @@ -333,6 +333,44 @@ static int dsa_slave_vlan_filtering(struct net_device *dev, return 0; } +static int dsa_fastest_ageing_time(struct dsa_switch *ds, + unsigned int ageing_time) +{ + int i; + + for (i = 0; i < DSA_MAX_PORTS; ++i) { + struct dsa_port *dp = &ds->ports[i]; + + if (dp && dp->ageing_time && dp->ageing_time < ageing_time) + ageing_time = dp->ageing_time; + } + + return ageing_time; +} + +static int dsa_slave_ageing_time(struct net_device *dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time); + unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); + + /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ + if (switchdev_trans_ph_prepare(trans)) + return 0; + + /* Keep the fastest ageing time in case of multiple bridges */ + ds->ports[p->port].ageing_time = ageing_time; + ageing_time = dsa_fastest_ageing_time(ds, ageing_time); + + if (ds->drv->set_ageing_time) + return ds->drv->set_ageing_time(ds, ageing_time); + + return 0; +} + static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) @@ -346,6 +384,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev, case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: ret = dsa_slave_vlan_filtering(dev, attr, trans); break; + case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: + ret = dsa_slave_ageing_time(dev, attr, trans); + break; default: ret = -EOPNOTSUPP; break; @@ -522,14 +563,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - /* Just return the original SKB */ - return skb; -} - - /* ethtool operations *******************************************************/ static int dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) @@ -615,7 +648,7 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->cd->eeprom_len) + if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; if (ds->drv->get_eeprom_len) @@ -873,6 +906,13 @@ static void dsa_slave_poll_controller(struct net_device *dev) } #endif +void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) +{ + ops->get_sset_count = dsa_cpu_port_get_sset_count; + ops->get_ethtool_stats = dsa_cpu_port_get_ethtool_stats; + ops->get_strings = dsa_cpu_port_get_strings; +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, @@ -893,8 +933,6 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, }; -static struct ethtool_ops dsa_cpu_port_ethtool_ops; - static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -999,13 +1037,12 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, struct net_device *slave_dev) { struct dsa_switch *ds = p->parent; - struct dsa_chip_data *cd = ds->cd; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; u32 phy_flags = 0; int mode, ret; - port_dn = cd->port_dn[p->port]; + port_dn = ds->ports[p->port].dn; mode = of_get_phy_mode(port_dn); if (mode < 0) mode = PHY_INTERFACE_MODE_NA; @@ -1109,14 +1146,18 @@ int dsa_slave_resume(struct net_device *slave_dev) } int dsa_slave_create(struct dsa_switch *ds, struct device *parent, - int port, char *name) + int port, const char *name) { - struct net_device *master = ds->dst->master_netdev; struct dsa_switch_tree *dst = ds->dst; + struct net_device *master; struct net_device *slave_dev; struct dsa_slave_priv *p; int ret; + master = ds->dst->master_netdev; + if (ds->master_netdev) + master = ds->master_netdev; + slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name, NET_NAME_UNKNOWN, ether_setup); if (slave_dev == NULL) @@ -1124,19 +1165,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->features = master->vlan_features; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; - if (master->ethtool_ops != &dsa_cpu_port_ethtool_ops) { - memcpy(&dst->master_ethtool_ops, master->ethtool_ops, - sizeof(struct ethtool_ops)); - memcpy(&dsa_cpu_port_ethtool_ops, &dst->master_ethtool_ops, - sizeof(struct ethtool_ops)); - dsa_cpu_port_ethtool_ops.get_sset_count = - dsa_cpu_port_get_sset_count; - dsa_cpu_port_ethtool_ops.get_ethtool_stats = - dsa_cpu_port_get_ethtool_stats; - dsa_cpu_port_ethtool_ops.get_strings = - dsa_cpu_port_get_strings; - master->ethtool_ops = &dsa_cpu_port_ethtool_ops; - } eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; @@ -1147,49 +1175,24 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, NULL); SET_NETDEV_DEV(slave_dev, parent); - slave_dev->dev.of_node = ds->cd->port_dn[port]; + slave_dev->dev.of_node = ds->ports[port].dn; slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); p->parent = ds; p->port = port; - - switch (ds->dst->tag_protocol) { -#ifdef CONFIG_NET_DSA_TAG_DSA - case DSA_TAG_PROTO_DSA: - p->xmit = dsa_netdev_ops.xmit; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - case DSA_TAG_PROTO_EDSA: - p->xmit = edsa_netdev_ops.xmit; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - case DSA_TAG_PROTO_TRAILER: - p->xmit = trailer_netdev_ops.xmit; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_BRCM - case DSA_TAG_PROTO_BRCM: - p->xmit = brcm_netdev_ops.xmit; - break; -#endif - default: - p->xmit = dsa_slave_notag_xmit; - break; - } + p->xmit = dst->tag_ops->xmit; p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; - ds->ports[port] = slave_dev; + ds->ports[port].netdev = slave_dev; ret = register_netdev(slave_dev); if (ret) { netdev_err(master, "error %d registering interface %s\n", ret, slave_dev->name); - ds->ports[port] = NULL; + ds->ports[port].netdev = NULL; free_netdev(slave_dev); return ret; } diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e2aadb731..21bffde6e 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -127,7 +127,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, source_port = brcm_tag[3] & BRCM_EG_PID_MASK; /* Validate port against switch setup, either the port is totally */ - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; /* Remove Broadcom tag and update checksum */ @@ -140,7 +140,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb->protocol = eth_type_trans(skb, skb->dev); skb->dev->stats.rx_packets++; diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index aa780e4ac..bce79ffe3 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -107,10 +107,14 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, * Check that the source device exists and that the source * port is a registered DSA port. */ - if (source_device >= dst->pd->nr_chips) + if (source_device >= DSA_MAX_SWITCHES) goto out_drop; + ds = dst->ds[source_device]; - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (!ds) + goto out_drop; + + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; /* @@ -159,7 +163,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 2288c8098..6c1720e88 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -120,10 +120,14 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, * Check that the source device exists and that the source * port is a registered DSA port. */ - if (source_device >= dst->pd->nr_chips) + if (source_device >= DSA_MAX_SWITCHES) goto out_drop; + ds = dst->ds[source_device]; - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (!ds) + goto out_drop; + + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; /* @@ -178,7 +182,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index b6ca0890d..5e3903eb1 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -82,12 +82,12 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, goto out_drop; source_port = trailer[1] & 7; - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; pskb_trim_rcsum(skb, skb->len - 4); - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index dd085db85..d7efbf0da 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,21 +58,10 @@ static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; -static struct lock_class_key lowpan_tx_busylock; -static struct lock_class_key lowpan_netdev_xmit_lock_key; - -static void lowpan_set_lockdep_class_one(struct net_device *ldev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, - &lowpan_netdev_xmit_lock_key); -} - static int lowpan_dev_init(struct net_device *ldev) { - netdev_for_each_tx_queue(ldev, lowpan_set_lockdep_class_one, NULL); - ldev->qdisc_tx_busylock = &lowpan_tx_busylock; + netdev_lockdep_set_classes(ldev); + return 0; } @@ -92,11 +81,21 @@ static int lowpan_stop(struct net_device *dev) return 0; } +static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) +{ + struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); + + /* default no short_addr is available for a neighbour */ + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + return 0; +} + static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, + .ndo_neigh_construct = lowpan_neigh_construct, }; static void lowpan_setup(struct net_device *ldev) @@ -131,8 +130,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, pr_debug("adding new link\n"); - if (!tb[IFLA_LINK] || - !net_eq(dev_net(ldev), &init_net)) + if (!tb[IFLA_LINK]) return -EINVAL; /* find and hold wpan device */ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); @@ -161,6 +159,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; + ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh); + ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index ef185dd41..649e7d45e 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -262,7 +262,7 @@ static inline bool lowpan_rx_h_check(struct sk_buff *skb) /* check on ieee802154 conform 6LoWPAN header */ if (!ieee802154_is_data(fc) || - !ieee802154_is_intra_pan(fc)) + !ieee802154_skb_is_intra_pan_addressing(fc, skb)) return false; /* check if we can dereference the dispatch */ diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index e459afd16..dbb476d7d 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -9,6 +9,7 @@ */ #include +#include #include #include @@ -17,19 +18,9 @@ #define LOWPAN_FRAG1_HEAD_SIZE 0x4 #define LOWPAN_FRAGN_HEAD_SIZE 0x5 -/* don't save pan id, it's intra pan */ -struct lowpan_addr { - u8 mode; - union { - /* IPv6 needs big endian here */ - __be64 extended_addr; - __be16 short_addr; - } u; -}; - struct lowpan_addr_info { - struct lowpan_addr daddr; - struct lowpan_addr saddr; + struct ieee802154_addr daddr; + struct ieee802154_addr saddr; }; static inline struct @@ -48,12 +39,14 @@ lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb) * RAW/DGRAM sockets. */ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { - const u8 *saddr = _saddr; - const u8 *daddr = _daddr; - struct lowpan_addr_info *info; + struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr; + struct lowpan_addr_info *info = lowpan_skb_priv(skb); + struct lowpan_802154_neigh *llneigh = NULL; + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct neighbour *n; /* TODO: * if this package isn't ipv6 one, where should it be routed? @@ -61,21 +54,50 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, if (type != ETH_P_IPV6) return 0; - if (!saddr) - saddr = ldev->dev_addr; + /* intra-pan communication */ + info->saddr.pan_id = wpan_dev->pan_id; + info->daddr.pan_id = info->saddr.pan_id; - raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8); - raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8); + if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) { + info->daddr.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + info->daddr.mode = IEEE802154_ADDR_SHORT; + } else { + __le16 short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + + n = neigh_lookup(&nd_tbl, &hdr->daddr, ldev); + if (n) { + llneigh = lowpan_802154_neigh(neighbour_priv(n)); + read_lock_bh(&n->lock); + short_addr = llneigh->short_addr; + read_unlock_bh(&n->lock); + } - info = lowpan_skb_priv(skb); + if (llneigh && + lowpan_802154_is_valid_src_short_addr(short_addr)) { + info->daddr.short_addr = short_addr; + info->daddr.mode = IEEE802154_ADDR_SHORT; + } else { + info->daddr.mode = IEEE802154_ADDR_LONG; + ieee802154_be64_to_le64(&info->daddr.extended_addr, + daddr); + } - /* TODO: Currently we only support extended_addr */ - info->daddr.mode = IEEE802154_ADDR_LONG; - memcpy(&info->daddr.u.extended_addr, daddr, - sizeof(info->daddr.u.extended_addr)); - info->saddr.mode = IEEE802154_ADDR_LONG; - memcpy(&info->saddr.u.extended_addr, saddr, - sizeof(info->daddr.u.extended_addr)); + if (n) + neigh_release(n); + } + + if (!saddr) { + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) { + info->saddr.mode = IEEE802154_ADDR_SHORT; + info->saddr.short_addr = wpan_dev->short_addr; + } else { + info->saddr.mode = IEEE802154_ADDR_LONG; + info->saddr.extended_addr = wpan_dev->extended_addr; + } + } else { + info->saddr.mode = IEEE802154_ADDR_LONG; + ieee802154_be64_to_le64(&info->saddr.extended_addr, saddr); + } return 0; } @@ -209,47 +231,26 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, u16 *dgram_size, u16 *dgram_offset) { struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr; - struct ieee802154_addr sa, da; struct ieee802154_mac_cb *cb = mac_cb_init(skb); struct lowpan_addr_info info; - void *daddr, *saddr; memcpy(&info, lowpan_skb_priv(skb), sizeof(info)); - /* TODO: Currently we only support extended_addr */ - daddr = &info.daddr.u.extended_addr; - saddr = &info.saddr.u.extended_addr; - *dgram_size = skb->len; - lowpan_header_compress(skb, ldev, daddr, saddr); + lowpan_header_compress(skb, ldev, &info.daddr, &info.saddr); /* dgram_offset = (saved bytes after compression) + lowpan header len */ *dgram_offset = (*dgram_size - skb->len) + skb_network_header_len(skb); cb->type = IEEE802154_FC_TYPE_DATA; - /* prepare wpan address data */ - sa.mode = IEEE802154_ADDR_LONG; - sa.pan_id = wpan_dev->pan_id; - sa.extended_addr = ieee802154_devaddr_from_raw(saddr); - - /* intra-PAN communications */ - da.pan_id = sa.pan_id; - - /* if the destination address is the broadcast address, use the - * corresponding short address - */ - if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) { - da.mode = IEEE802154_ADDR_SHORT; - da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + if (info.daddr.mode == IEEE802154_ADDR_SHORT && + ieee802154_is_broadcast_short_addr(info.daddr.short_addr)) cb->ackreq = false; - } else { - da.mode = IEEE802154_ADDR_LONG; - da.extended_addr = ieee802154_devaddr_from_raw(daddr); + else cb->ackreq = wpan_dev->ackreq; - } - return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, &da, - &sa, 0); + return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, + &info.daddr, &info.saddr, 0); } netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index c35fdfa6d..cb7176cd4 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -140,6 +140,8 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) rdev->wpan_phy.dev.class = &wpan_phy_class; rdev->wpan_phy.dev.platform_data = rdev; + wpan_phy_net_set(&rdev->wpan_phy, &init_net); + init_waitqueue_head(&rdev->dev_wait); return &rdev->wpan_phy; @@ -207,6 +209,49 @@ void wpan_phy_free(struct wpan_phy *phy) } EXPORT_SYMBOL(wpan_phy_free); +int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, + struct net *net) +{ + struct wpan_dev *wpan_dev; + int err = 0; + + list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { + if (!wpan_dev->netdev) + continue; + wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); + if (err) + break; + wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + } + + if (err) { + /* failed -- clean up to old netns */ + net = wpan_phy_net(&rdev->wpan_phy); + + list_for_each_entry_continue_reverse(wpan_dev, + &rdev->wpan_dev_list, + list) { + if (!wpan_dev->netdev) + continue; + wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + err = dev_change_net_namespace(wpan_dev->netdev, net, + "wpan%d"); + WARN_ON(err); + wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + } + + return err; + } + + wpan_phy_net_set(&rdev->wpan_phy, net); + + err = device_rename(&rdev->wpan_phy.dev, dev_name(&rdev->wpan_phy.dev)); + WARN_ON(err); + + return 0; +} + void cfg802154_dev_free(struct cfg802154_registered_device *rdev) { kfree(rdev); @@ -286,14 +331,34 @@ static struct notifier_block cfg802154_netdev_notifier = { .notifier_call = cfg802154_netdev_notifier_call, }; +static void __net_exit cfg802154_pernet_exit(struct net *net) +{ + struct cfg802154_registered_device *rdev; + + rtnl_lock(); + list_for_each_entry(rdev, &cfg802154_rdev_list, list) { + if (net_eq(wpan_phy_net(&rdev->wpan_phy), net)) + WARN_ON(cfg802154_switch_netns(rdev, &init_net)); + } + rtnl_unlock(); +} + +static struct pernet_operations cfg802154_pernet_ops = { + .exit = cfg802154_pernet_exit, +}; + static int __init wpan_phy_class_init(void) { int rc; - rc = wpan_phy_sysfs_init(); + rc = register_pernet_device(&cfg802154_pernet_ops); if (rc) goto err; + rc = wpan_phy_sysfs_init(); + if (rc) + goto err_sysfs; + rc = register_netdevice_notifier(&cfg802154_netdev_notifier); if (rc) goto err_nl; @@ -315,6 +380,8 @@ err_notifier: unregister_netdevice_notifier(&cfg802154_netdev_notifier); err_nl: wpan_phy_sysfs_exit(); +err_sysfs: + unregister_pernet_device(&cfg802154_pernet_ops); err: return rc; } @@ -326,6 +393,7 @@ static void __exit wpan_phy_class_exit(void) ieee802154_nl_exit(); unregister_netdevice_notifier(&cfg802154_netdev_notifier); wpan_phy_sysfs_exit(); + unregister_pernet_device(&cfg802154_pernet_ops); } module_exit(wpan_phy_class_exit); diff --git a/net/ieee802154/core.h b/net/ieee802154/core.h index 231fade95..81141f58d 100644 --- a/net/ieee802154/core.h +++ b/net/ieee802154/core.h @@ -38,6 +38,8 @@ wpan_phy_to_rdev(struct wpan_phy *wpan_phy) extern struct list_head cfg802154_rdev_list; extern int cfg802154_rdev_list_generation; +int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, + struct net *net); /* free object */ void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 116187b5c..d90a4ed5b 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -80,7 +80,8 @@ __cfg802154_wpan_dev_from_attrs(struct net *netns, struct nlattr **attrs) list_for_each_entry(rdev, &cfg802154_rdev_list, list) { struct wpan_dev *wpan_dev; - /* TODO netns compare */ + if (wpan_phy_net(&rdev->wpan_phy) != netns) + continue; if (have_wpan_dev_id && rdev->wpan_phy_idx != wpan_phy_idx) continue; @@ -175,7 +176,8 @@ __cfg802154_rdev_from_attrs(struct net *netns, struct nlattr **attrs) if (!rdev) return ERR_PTR(-ENODEV); - /* TODO netns compare */ + if (netns != wpan_phy_net(&rdev->wpan_phy)) + return ERR_PTR(-ENODEV); return rdev; } @@ -233,6 +235,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 }, + [NL802154_ATTR_PID] = { .type = NLA_U32 }, + [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 }, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, @@ -590,7 +594,6 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct cfg802154_registered_device *rdev; int ifidx = nla_get_u32(tb[NL802154_ATTR_IFINDEX]); - /* TODO netns */ netdev = __dev_get_by_index(&init_net, ifidx); if (!netdev) return -ENODEV; @@ -629,7 +632,8 @@ nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb) } list_for_each_entry(rdev, &cfg802154_rdev_list, list) { - /* TODO net ns compare */ + if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) + continue; if (++idx <= state->start) continue; if (state->filter_wpan_phy != -1 && @@ -871,7 +875,8 @@ nl802154_dump_interface(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { - /* TODO netns compare */ + if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) + continue; if (wp_idx < wp_start) { wp_idx++; continue; @@ -1271,6 +1276,37 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); } +static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net *net; + int err; + + if (info->attrs[NL802154_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[NL802154_ATTR_PID]); + + net = get_net_ns_by_pid(pid); + } else if (info->attrs[NL802154_ATTR_NETNS_FD]) { + u32 fd = nla_get_u32(info->attrs[NL802154_ATTR_NETNS_FD]); + + net = get_net_ns_by_fd(fd); + } else { + return -EINVAL; + } + + if (IS_ERR(net)) + return PTR_ERR(net); + + err = 0; + + /* check if anything to do */ + if (!net_eq(wpan_phy_net(&rdev->wpan_phy), net)) + err = cfg802154_switch_netns(rdev, net); + + put_net(net); + return err; +} + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, @@ -2261,6 +2297,14 @@ static const struct genl_ops nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, + { + .cmd = NL802154_CMD_SET_WPAN_PHY_NETNS, + .doit = nl802154_wpan_phy_netns, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, { .cmd = NL802154_CMD_SET_PAN_ID, .doit = nl802154_set_pan_id, diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index eb51c43c2..c4ed105ac 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -532,6 +532,22 @@ config TCP_CONG_VEGAS window. TCP Vegas should provide less packet loss, but it is not as aggressive as TCP Reno. +config TCP_CONG_NV + tristate "TCP NV" + default n + ---help--- + TCP NV is a follow up to TCP Vegas. It has been modified to deal with + 10G networks, measurement noise introduced by LRO, GRO and interrupt + coalescence. In addition, it will decrease its cwnd multiplicatively + instead of linearly. + + Note that in general congestion avoidance (cwnd decreased when # packets + queued grows) cannot coexist with congestion control (cwnd decreased only + when there is packet loss) due to fairness issues. One scenario when they + can coexist safely is when the CA flows have RTTs << CC flows RTTs. + + For further details see http://www.brakmo.org/networking/tcp-nv/ + config TCP_CONG_SCALABLE tristate "Scalable TCP" default n diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bfa133691..24629b6f5 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d39e9e47a..55513e654 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -73,7 +73,7 @@ #include #include #include -#include +#include #include #include #include @@ -1916,6 +1916,3 @@ static int __init ipv4_proc_init(void) return 0; } #endif /* CONFIG_PROC_FS */ - -MODULE_ALIAS_NETPROTO(PF_INET); - diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 40d6b8771..72d6f056d 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -134,76 +134,6 @@ int cipso_v4_rbm_strictvalid = 1; * Helper Functions */ -/** - * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit - * @bitmap: the bitmap - * @bitmap_len: length in bits - * @offset: starting offset - * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit - * - * Description: - * Starting at @offset, walk the bitmap from left to right until either the - * desired bit is found or we reach the end. Return the bit offset, -1 if - * not found, or -2 if error. - */ -static int cipso_v4_bitmap_walk(const unsigned char *bitmap, - u32 bitmap_len, - u32 offset, - u8 state) -{ - u32 bit_spot; - u32 byte_offset; - unsigned char bitmask; - unsigned char byte; - - /* gcc always rounds to zero when doing integer division */ - byte_offset = offset / 8; - byte = bitmap[byte_offset]; - bit_spot = offset; - bitmask = 0x80 >> (offset % 8); - - while (bit_spot < bitmap_len) { - if ((state && (byte & bitmask) == bitmask) || - (state == 0 && (byte & bitmask) == 0)) - return bit_spot; - - bit_spot++; - bitmask >>= 1; - if (bitmask == 0) { - byte = bitmap[++byte_offset]; - bitmask = 0x80; - } - } - - return -1; -} - -/** - * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap - * @bitmap: the bitmap - * @bit: the bit - * @state: if non-zero, set the bit (1) else clear the bit (0) - * - * Description: - * Set a single bit in the bitmask. Returns zero on success, negative values - * on error. - */ -static void cipso_v4_bitmap_setbit(unsigned char *bitmap, - u32 bit, - u8 state) -{ - u32 byte_spot; - u8 bitmask; - - /* gcc always rounds to zero when doing integer division */ - byte_spot = bit / 8; - bitmask = 0x80 >> (bit % 8); - if (state) - bitmap[byte_spot] |= bitmask; - else - bitmap[byte_spot] &= ~bitmask; -} - /** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free @@ -840,10 +770,10 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { - cat = cipso_v4_bitmap_walk(bitmap, - bitmap_len_bits, - cat + 1, - 1); + cat = netlbl_bitmap_walk(bitmap, + bitmap_len_bits, + cat + 1, + 1); if (cat < 0) break; if (cat >= cipso_cat_size || @@ -909,7 +839,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, } if (net_spot >= net_clen_bits) return -ENOSPC; - cipso_v4_bitmap_setbit(net_cat, net_spot, 1); + netlbl_bitmap_setbit(net_cat, net_spot, 1); if (net_spot > net_spot_max) net_spot_max = net_spot; @@ -951,10 +881,10 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, } for (;;) { - net_spot = cipso_v4_bitmap_walk(net_cat, - net_clen_bits, - net_spot + 1, - 1); + net_spot = netlbl_bitmap_walk(net_cat, + net_clen_bits, + net_spot + 1, + 1); if (net_spot < 0) { if (net_spot == -2) return -EFAULT; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e333bc86b..062a67ca9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1834,7 +1834,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC); + skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; @@ -1846,7 +1846,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: if (err < 0) @@ -1903,7 +1903,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); + skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; @@ -2027,16 +2027,16 @@ static void inet_forward_change(struct net *net) for_each_netdev(net, dev) { struct in_device *in_dev; + if (on) dev_disable_lro(dev); - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); + + in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } - rcu_read_unlock(); } } @@ -2232,7 +2232,7 @@ static struct devinet_sysctl_table { }; static int __devinet_sysctl_register(struct net *net, char *dev_name, - struct ipv4_devconf *p) + int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; @@ -2255,6 +2255,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, goto free; p->sysctl = t; + + inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); return 0; free: @@ -2286,7 +2288,7 @@ static int devinet_sysctl_register(struct in_device *idev) if (err) return err; err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, - &idev->cnf); + idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; @@ -2347,11 +2349,12 @@ static __net_init int devinet_init_net(struct net *net) } #ifdef CONFIG_SYSCTL - err = __devinet_sysctl_register(net, "all", all); + err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; - err = __devinet_sysctl_register(net, "default", dflt); + err = __devinet_sysctl_register(net, "default", + NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ef2ebeb89..1b25daf8c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -509,6 +509,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; + cfg->fc_table = l3mdev_fib_table(dev); if (colon) { struct in_ifaddr *ifa; struct in_device *in_dev = __in_dev_get_rtnl(dev); @@ -1027,7 +1028,7 @@ no_promotions: * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ - if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) + if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f2bda9e89..6e9ea69e5 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -76,6 +76,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, { int err = -EAGAIN; struct fib_table *tbl; + u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: @@ -94,7 +95,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, rcu_read_lock(); - tbl = fib_get_table(rule->fr_net, rule->table); + tb_id = fib_rule_get_table(rule, arg); + tbl = fib_get_table(rule->fr_net, tb_id); if (tbl) err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *)arg->result, @@ -180,7 +182,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (err) goto errout; - if (rule->table == RT_TABLE_UNSPEC) { + if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) { if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 539fa264e..e9f56225e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; + fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; change_nexthops(fi) { @@ -1337,18 +1338,21 @@ nla_put_failure: * referring to it. * - device went down -> we must shutdown all nexthops going via it. */ -int fib_sync_down_addr(struct net *net, __be32 local) +int fib_sync_down_addr(struct net_device *dev, __be32 local) { int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + struct net *net = dev_net(dev); + int tb_id = l3mdev_fib_table(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) return 0; hlist_for_each_entry(fi, head, fib_lhash) { - if (!net_eq(fi->fib_net, net)) + if (!net_eq(fi->fib_net, net) || + fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index febca0f10..e2ffc2a5c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -249,7 +249,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv) * index into the parent's child array. That is, they will be used to find * 'n' among tp's children. * - * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits + * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits * for the node n. * * All the bits we have seen so far are significant to the node n. The rest @@ -258,7 +258,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv) * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into * n's child array, and will of course be different for each child. * - * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown + * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown * at this point. */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 5f9207c03..321d57f82 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -129,6 +129,36 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + + int prot; + + switch (((struct iphdr *)guehdr)->version) { + case 4: + prot = IPPROTO_IPIP; + break; + case 6: + prot = IPPROTO_IPV6; + break; + default: + goto drop; + } + + if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) + goto drop; + + return -prot; + } + + default: /* Undefined version */ + goto drop; + } + optlen = guehdr->hlen << 2; len += optlen; @@ -289,6 +319,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, int flush = 1; struct fou *fou = fou_from_sock(sk); struct gro_remcsum grc; + u8 proto; skb_gro_remcsum_init(&grc); @@ -302,6 +333,25 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, goto out; } + switch (guehdr->version) { + case 0: + break; + case 1: + switch (((struct iphdr *)guehdr)->version) { + case 4: + proto = IPPROTO_IPIP; + break; + case 6: + proto = IPPROTO_IPV6; + break; + default: + goto out; + } + goto next_proto; + default: + goto out; + } + optlen = guehdr->hlen << 2; len += optlen; @@ -370,6 +420,10 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, } } + proto = guehdr->proto_ctype; + +next_proto: + /* We can clear the encap_mark for GUE as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are are simply @@ -383,7 +437,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[guehdr->proto_ctype]); + ops = rcu_dereference(offloads[proto]); if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) goto out_unlock; @@ -404,13 +458,30 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); const struct net_offload *ops; - unsigned int guehlen; + unsigned int guehlen = 0; u8 proto; int err = -ENOENT; - proto = guehdr->proto_ctype; - - guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + switch (guehdr->version) { + case 0: + proto = guehdr->proto_ctype; + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + break; + case 1: + switch (((struct iphdr *)guehdr)->version) { + case 4: + proto = IPPROTO_IPIP; + break; + case 6: + proto = IPPROTO_IPV6; + break; + default: + return err; + } + break; + default: + return err; + } rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index de1d119a4..b798862b6 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -117,6 +117,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if ((*(u8 *)options & 0xF0) != 0x40) hdr_len += 4; } + tpi->hdr_len = hdr_len; return hdr_len; } EXPORT_SYMBOL(gre_parse_header); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fa8c39804..61a9deec2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -603,7 +603,7 @@ static void reqsk_timer_handler(unsigned long data) if (req->num_timeout++ == 0) atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); - mod_timer_pinned(&req->rsk_timer, jiffies + timeo); + mod_timer(&req->rsk_timer, jiffies + timeo); return; } drop: @@ -617,8 +617,9 @@ static void reqsk_queue_hash_req(struct request_sock *req, req->num_timeout = 0; req->sk = NULL; - setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); - mod_timer_pinned(&req->rsk_timer, jiffies + timeout); + setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler, + (unsigned long)req); + mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL); /* before letting lookups find us, make sure all req fields diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 25af12436..38c2c47fe 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -44,6 +44,7 @@ struct inet_diag_entry { u16 dport; u16 family; u16 userlocks; + u32 ifindex; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -571,6 +572,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_DEV_COND: { + u32 ifindex; + + ifindex = *((const u32 *)(op + 1)); + if (ifindex != entry->ifindex) + yes = 0; + break; + } } if (yes) { @@ -613,6 +622,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); + entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; return inet_diag_bc_run(bc, &entry); @@ -636,6 +646,17 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } +/* data is u32 ifindex */ +static bool valid_devcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + /* Check ifindex space. */ + *min_len += sizeof(u32); + if (len < *min_len) + return false; + + return true; +} /* Validate an inet_diag_hostcond. */ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) @@ -700,6 +721,10 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_DEV_COND: + if (!valid_devcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 3a88b0c73..b5e9317ea 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -355,7 +355,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, { struct inet_frag_queue *q; - if (frag_mem_limit(nf) > nf->high_thresh) { + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { inet_frag_schedule_worker(f); return NULL; } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 206581674..ddcd56c08 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -188,7 +188,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); - setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw); + setup_pinned_timer(&tw->tw_timer, tw_timer_handler, + (unsigned long)tw); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this @@ -248,7 +249,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) tw->tw_kill = timeo <= 4*HZ; if (!rearm) { - BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); + BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); } else { mod_timer_pending(&tw->tw_timer, jiffies + timeo); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index cbfb1808f..8b4ffd216 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -54,7 +54,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; @@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - IPCB(skb)->flags |= IPSKB_FORWARDED; + IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1d000af7f..113cc43df 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -138,6 +138,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + unsigned int data_len = 0; struct ip_tunnel *t; switch (type) { @@ -163,6 +164,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return; + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -181,6 +183,13 @@ static void ipgre_err(struct sk_buff *skb, u32 info, if (!t) return; +#if IS_ENABLED(CONFIG_IPV6) + if (tpi->proto == htons(ETH_P_IPV6) && + !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return; +#endif + if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) return; @@ -361,7 +370,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->parms.o_flags, proto, tunnel->parms.o_key, htonl(tunnel->o_seqno)); - skb_set_inner_protocol(skb, proto); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -837,17 +845,19 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct net_device *dev, +static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], struct ip_tunnel_parm *parms) { + struct ip_tunnel *t = netdev_priv(dev); + memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_GRE; if (!data) - return; + return 0; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); @@ -876,16 +886,26 @@ static void ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); - if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) + if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) { + if (t->ignore_df) + return -EINVAL; parms->iph.frag_off = htons(IP_DF); + } if (data[IFLA_GRE_COLLECT_METADATA]) { - struct ip_tunnel *t = netdev_priv(dev); - t->collect_md = true; if (dev->type == ARPHRD_IPGRE) dev->type = ARPHRD_NONE; } + + if (data[IFLA_GRE_IGNORE_DF]) { + if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) + && (parms->iph.frag_off & htons(IP_DF))) + return -EINVAL; + t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); + } + + return 0; } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -956,16 +976,19 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_newlink(dev, tb, &p); } @@ -974,16 +997,19 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_changelink(dev, tb, &p); } @@ -1020,6 +1046,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_GRE_IGNORE_DF */ + nla_total_size(1) + 0; } @@ -1053,6 +1081,9 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df)) + goto nla_put_failure; + if (t->collect_md) { if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) goto nla_put_failure; @@ -1080,6 +1111,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 4b351af3e..d6feabb03 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + struct net_device *dev = skb->dev; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + iph->tos, dev); if (unlikely(err)) { if (err == -EXDEV) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); @@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4bd492163..dde37fb34 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -223,9 +223,11 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *segs; int ret = 0; - /* common case: locally created skb or seglen is <= mtu */ - if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || - skb_gso_network_seglen(skb) <= mtu) + /* common case: fragmentation of segments is not allowed, + * or seglen is <= mtu + */ + if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) || + skb_gso_validate_mtu(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d8f5e0a26..95649ebd2 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -682,7 +682,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off&htons(IP_DF)); max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index afd6b5968..0f227db0e 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -63,6 +63,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); struct net_device *dev = skb->dev; + int skb_iif = skb->skb_iif; struct iphdr *iph; int err; @@ -72,6 +73,16 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + if (skb_iif && !(df & htons(IP_DF))) { + /* Arrived from an ingress interface, got encapsulated, with + * fragmentation of encapulating frames allowed. + * If skb is gso, the resulting encapsulated network segments + * may exceed dst mtu. + * Allow IP Fragmentation of segments. + */ + IPCB(skb)->flags |= IPSKB_FRAG_SEGS; + } + /* Push down and install the IP header. */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index cc701fa70..5d7944f39 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -88,6 +88,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; @@ -105,7 +106,19 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 978370132..4ae3f8e6c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -148,14 +148,14 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->parms.link, 0, IPPROTO_IPIP, 0); + t->parms.link, 0, iph->protocol, 0); err = 0; goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, - IPPROTO_IPIP, 0); + iph->protocol, 0); err = 0; goto out; } @@ -177,12 +177,19 @@ out: return err; } -static const struct tnl_ptk_info tpi = { +static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; -static int ipip_rcv(struct sk_buff *skb) +#if IS_ENABLED(CONFIG_MPLS) +static const struct tnl_ptk_info mplsip_tpi = { + /* no tunnel info required for mplsip. */ + .proto = htons(ETH_P_MPLS_UC), +}; +#endif + +static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); @@ -193,11 +200,23 @@ static int ipip_rcv(struct sk_buff *skb) tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel) { + const struct tnl_ptk_info *tpi; + + if (tunnel->parms.iph.protocol != ipproto && + tunnel->parms.iph.protocol != 0) + goto drop; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto, false)) +#if IS_ENABLED(CONFIG_MPLS) + if (ipproto == IPPROTO_MPLS) + tpi = &mplsip_tpi; + else +#endif + tpi = &ipip_tpi; + if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return -1; @@ -207,24 +226,51 @@ drop: return 0; } +static int ipip_rcv(struct sk_buff *skb) +{ + return ipip_tunnel_rcv(skb, IPPROTO_IPIP); +} + +#if IS_ENABLED(CONFIG_MPLS) +static int mplsip_rcv(struct sk_buff *skb) +{ + return ipip_tunnel_rcv(skb, IPPROTO_MPLS); +} +#endif + /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ -static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; + u8 ipproto; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ipproto = IPPROTO_IPIP; + break; +#if IS_ENABLED(CONFIG_MPLS) + case htons(ETH_P_MPLS_UC): + ipproto = IPPROTO_MPLS; + break; +#endif + default: + goto tx_error; + } - if (unlikely(skb->protocol != htons(ETH_P_IP))) + if (tiph->protocol != ipproto && tiph->protocol != 0) goto tx_error; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; - skb_set_inner_ipproto(skb, IPPROTO_IPIP); + skb_set_inner_ipproto(skb, ipproto); - ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); + ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: @@ -234,6 +280,20 @@ tx_error: return NETDEV_TX_OK; } +static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) +{ + switch (ipproto) { + case 0: + case IPPROTO_IPIP: +#if IS_ENABLED(CONFIG_MPLS) + case IPPROTO_MPLS: +#endif + return true; + } + + return false; +} + static int ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -244,7 +304,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EFAULT; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + if (p.iph.version != 4 || + !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) return -EINVAL; } @@ -301,10 +362,23 @@ static int ipip_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 0; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; - tunnel->parms.iph.protocol = IPPROTO_IPIP; return ip_tunnel_init(dev); } +static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + u8 proto; + + if (!data || !data[IFLA_IPTUN_PROTO]) + return 0; + + proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0) + return -EINVAL; + + return 0; +} + static void ipip_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm *parms) { @@ -335,6 +409,9 @@ static void ipip_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); + if (data[IFLA_IPTUN_PROTO]) + parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); } @@ -427,6 +504,8 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + + /* IFLA_IPTUN_PROTO */ + nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ @@ -450,6 +529,7 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || + nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF)))) goto nla_put_failure; @@ -476,6 +556,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, + [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, @@ -489,6 +570,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = { .policy = ipip_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip_tunnel_setup, + .validate = ipip_tunnel_validate, .newlink = ipip_newlink, .changelink = ipip_changelink, .dellink = ip_tunnel_dellink, @@ -503,6 +585,14 @@ static struct xfrm_tunnel ipip_handler __read_mostly = { .priority = 1, }; +#if IS_ENABLED(CONFIG_MPLS) +static struct xfrm_tunnel mplsip_handler __read_mostly = { + .handler = mplsip_rcv, + .err_handler = ipip_err, + .priority = 1, +}; +#endif + static int __net_init ipip_init_net(struct net *net) { return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); @@ -525,7 +615,7 @@ static int __init ipip_init(void) { int err; - pr_info("ipip: IPv4 over IPv4 tunneling driver\n"); + pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&ipip_net_ops); if (err < 0) @@ -533,8 +623,15 @@ static int __init ipip_init(void) err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); - goto xfrm_tunnel_failed; + goto xfrm_tunnel_ipip_failed; + } +#if IS_ENABLED(CONFIG_MPLS) + err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); + if (err < 0) { + pr_info("%s: can't register tunnel\n", __func__); + goto xfrm_tunnel_mplsip_failed; } +#endif err = rtnl_link_register(&ipip_link_ops); if (err < 0) goto rtnl_link_failed; @@ -543,8 +640,13 @@ out: return err; rtnl_link_failed: +#if IS_ENABLED(CONFIG_MPLS) + xfrm4_tunnel_deregister(&mplsip_handler, AF_INET); +xfrm_tunnel_mplsip_failed: + +#endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); -xfrm_tunnel_failed: +xfrm_tunnel_ipip_failed: unregister_pernet_device(&ipip_net_ops); goto out; } @@ -554,7 +656,10 @@ static void __exit ipip_fini(void) rtnl_link_unregister(&ipip_link_ops); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) pr_info("%s: can't deregister tunnel\n", __func__); - +#if IS_ENABLED(CONFIG_MPLS) + if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS)) + pr_info("%s: can't deregister tunnel\n", __func__); +#endif unregister_pernet_device(&ipip_net_ops); } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5ad48ec77..5f006e13d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -722,6 +722,7 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache cache->mfc_un.res.maxvif = vifi + 1; } } + cache->mfc_un.res.lastuse = jiffies; } static int vif_add(struct net *net, struct mr_table *mrt, @@ -1748,7 +1749,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->dev->stats.tx_bytes += skb->len; } - IPCB(skb)->flags |= IPSKB_FORWARDED; + IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output @@ -1792,6 +1793,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, vif = cache->mfc_parent; cache->mfc_un.res.pkt++; cache->mfc_un.res.bytes += skb->len; + cache->mfc_un.res.lastuse = jiffies; if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; @@ -2071,10 +2073,11 @@ drop: static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) { - int ct; - struct rtnexthop *nhp; - struct nlattr *mp_attr; struct rta_mfc_stats mfcs; + struct nlattr *mp_attr; + struct rtnexthop *nhp; + unsigned long lastuse; + int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ if (c->mfc_parent >= MAXVIFS) @@ -2103,10 +2106,15 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0) + if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), + RTA_PAD)) return -EMSGSIZE; rtm->rtm_type = RTN_MULTICAST; @@ -2115,7 +2123,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, - struct rtmsg *rtm, int nowait) + struct rtmsg *rtm, int nowait, u32 portid) { struct mfc_cache *cache; struct mr_table *mrt; @@ -2160,6 +2168,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, return -ENOMEM; } + NETLINK_CB(skb2).portid = portid; skb_push(skb2, sizeof(struct iphdr)); skb_reset_network_header(skb2); iph = ip_hdr(skb2); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 2033f929a..b31df597f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -89,22 +89,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr, __be32 src_ipaddr, tgt_ipaddr; long ret; -#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) - - if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, - ARPT_INV_ARPOP)) + if (NF_INVF(arpinfo, ARPT_INV_ARPOP, + (arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop)) return 0; - if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, - ARPT_INV_ARPHRD)) + if (NF_INVF(arpinfo, ARPT_INV_ARPHRD, + (arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd)) return 0; - if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, - ARPT_INV_ARPPRO)) + if (NF_INVF(arpinfo, ARPT_INV_ARPPRO, + (arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro)) return 0; - if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, - ARPT_INV_ARPHLN)) + if (NF_INVF(arpinfo, ARPT_INV_ARPHLN, + (arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln)) return 0; src_devaddr = arpptr; @@ -115,31 +113,32 @@ static inline int arp_packet_match(const struct arphdr *arphdr, arpptr += dev->addr_len; memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); - if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), - ARPT_INV_SRCDEVADDR) || - FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), - ARPT_INV_TGTDEVADDR)) + if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR, + arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, + dev->addr_len)) || + NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR, + arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, + dev->addr_len))) return 0; - if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, - ARPT_INV_SRCIP) || - FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), - ARPT_INV_TGTIP)) + if (NF_INVF(arpinfo, ARPT_INV_SRCIP, + (src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr) || + NF_INVF(arpinfo, ARPT_INV_TGTIP, + (tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr)) return 0; /* Look for ifname matches. */ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_IN)) + if (NF_INVF(arpinfo, ARPT_INV_VIA_IN, ret != 0)) return 0; ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) + if (NF_INVF(arpinfo, ARPT_INV_VIA_OUT, ret != 0)) return 0; return 1; -#undef FWINV } static inline int arp_checkentry(const struct arpt_arp *arp) @@ -300,23 +299,12 @@ static inline bool unconditional(const struct arpt_entry *e) memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } -static bool find_jump_target(const struct xt_table_info *t, - const struct arpt_entry *target) -{ - struct arpt_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -389,10 +377,11 @@ static int mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct arpt_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -544,6 +533,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, const struct arpt_replace *repl) { struct arpt_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -556,6 +546,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ @@ -566,17 +559,20 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - break; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(arpt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } if (ret != 0) - return ret; + goto out_free; + ret = -EINVAL; if (i != repl->num_entries) - return -EINVAL; + goto out_free; /* Check hooks all assigned */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { @@ -584,13 +580,16 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; if (newinfo->underflow[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -610,6 +609,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 54906e0e8..f993545a3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -58,32 +58,31 @@ ip_packet_match(const struct iphdr *ip, { unsigned long ret; -#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) - - if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, - IPT_INV_SRCIP) || - FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, - IPT_INV_DSTIP)) + if (NF_INVF(ipinfo, IPT_INV_SRCIP, + (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) || + NF_INVF(ipinfo, IPT_INV_DSTIP, + (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr)) return false; ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_IN)) + if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_OUT)) + if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0)) return false; /* Check specific protocol */ if (ipinfo->proto && - FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) + NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto)) return false; /* If we have a fragment rule but the packet is not a fragment * then we return zero */ - if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) + if (NF_INVF(ipinfo, IPT_INV_FRAG, + (ipinfo->flags & IPT_F_FRAG) && !isfrag)) return false; return true; @@ -122,7 +121,6 @@ static inline bool unconditional(const struct ipt_entry *e) return e->target_offset == sizeof(struct ipt_entry) && memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; -#undef FWINV } /* for const-correctness */ @@ -375,23 +373,12 @@ ipt_do_table(struct sk_buff *skb, else return verdict; } -static bool find_jump_target(const struct xt_table_info *t, - const struct ipt_entry *target) -{ - struct ipt_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -460,10 +447,11 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct ipt_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -696,6 +684,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ipt_replace *repl) { struct ipt_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -708,6 +697,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -717,15 +709,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - return ret; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ipt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } + ret = -EINVAL; if (i != repl->num_entries) - return -EINVAL; + goto out_free; /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { @@ -733,13 +728,16 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; if (newinfo->underflow[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -759,6 +757,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 57fc97cda..aebdb337f 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -87,10 +87,6 @@ iptable_mangle_hook(void *priv, { if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, state); - if (state->hook == NF_INET_POST_ROUTING) - return ipt_do_table(skb, state, - state->net->ipv4.iptable_mangle); - /* PREROUTING/INPUT/FORWARD: */ return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index c6f3c406f..63923710f 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -26,6 +26,8 @@ struct ct_iter_state { struct seq_net_private p; + struct hlist_nulls_head *hash; + unsigned int htable_size; unsigned int bucket; }; @@ -35,10 +37,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < st->htable_size; st->bucket++) { n = rcu_dereference( - hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); + hlist_nulls_first_rcu(&st->hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -53,11 +55,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= st->htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); + hlist_nulls_first_rcu(&st->hash[st->bucket])); } return head; } @@ -75,7 +77,11 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { + struct ct_iter_state *st = seq->private; + rcu_read_lock(); + + nf_conntrack_get_ht(&st->hash, &st->htable_size); return ct_get_idx(seq, *pos); } diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index b6ea57ec5..fd8220213 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -24,6 +24,9 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) return NULL; + if (ip_hdr(oldskb)->protocol != IPPROTO_TCP) + return NULL; + oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), sizeof(struct tcphdr), _oth); if (oth == NULL) diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 2375b0a8b..30493beb6 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, __be32 saddr, daddr; u_int8_t tos; const struct iphdr *iph; + int err; /* root is playing with raw sockets. */ if (skb->len < sizeof(struct iphdr) || @@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv, tos = iph->tos; ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || - iph->tos != tos) - if (ip_route_me_harder(state->net, skb, RTN_UNSPEC)) - ret = NF_DROP; + iph->tos != tos) { + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } } return ret; } diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index c24f41c81..2c2553b90 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -46,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = { .eval = nft_reject_ipv4_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a1f2830d8..62c3ed0b7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs) atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; - u32 delta = 0; + u32 new, delta = 0; if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, p_id) - segs; + /* Do not use atomic_add_return() as it makes UBSAN unhappy */ + do { + old = (u32)atomic_read(p_id); + new = old + delta + segs; + } while (atomic_cmpxchg(p_id, old, new) != old); + + return new - segs; } EXPORT_SYMBOL(ip_idents_reserve); @@ -2497,7 +2503,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, - r, nowait); + r, nowait, portid); + if (err <= 0) { if (!nowait) { if (err == 0) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 99fbc1479..2fa1a4a6d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2278,6 +2278,38 @@ static inline bool tcp_can_repair_sock(const struct sock *sk) ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); } +static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) +{ + struct tcp_repair_window opt; + + if (!tp->repair) + return -EPERM; + + if (len != sizeof(opt)) + return -EINVAL; + + if (copy_from_user(&opt, optbuf, sizeof(opt))) + return -EFAULT; + + if (opt.max_window < opt.snd_wnd) + return -EINVAL; + + if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) + return -EINVAL; + + if (after(opt.rcv_wup, tp->rcv_nxt)) + return -EINVAL; + + tp->snd_wl1 = opt.snd_wl1; + tp->snd_wnd = opt.snd_wnd; + tp->max_window = opt.max_window; + + tp->rcv_wnd = opt.rcv_wnd; + tp->rcv_wup = opt.rcv_wup; + + return 0; +} + static int tcp_repair_options_est(struct tcp_sock *tp, struct tcp_repair_opt __user *optbuf, unsigned int len) { @@ -2708,6 +2740,9 @@ stealth_integrity_out_1: else tp->tsoffset = val - tcp_time_stamp; break; + case TCP_REPAIR_WINDOW: + err = tcp_repair_set_window(tp, optval, optlen); + break; case TCP_NOTSENT_LOWAT: tp->notsent_lowat = val; sk->sk_write_space(sk); @@ -2976,6 +3011,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EINVAL; break; + case TCP_REPAIR_WINDOW: { + struct tcp_repair_window opt; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len != sizeof(opt)) + return -EINVAL; + + if (!tp->repair) + return -EPERM; + + opt.snd_wl1 = tp->snd_wl1; + opt.snd_wnd = tp->snd_wnd; + opt.max_window = tp->max_window; + opt.rcv_wnd = tp->rcv_wnd; + opt.rcv_wup = tp->rcv_wup; + + if (copy_to_user(optval, &opt, len)) + return -EFAULT; + return 0; + } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; @@ -3085,8 +3142,18 @@ static void __tcp_alloc_md5sig_pool(void) return; for_each_possible_cpu(cpu) { + void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch; struct ahash_request *req; + if (!scratch) { + scratch = kmalloc_node(sizeof(union tcp_md5sum_block) + + sizeof(struct tcphdr), + GFP_KERNEL, + cpu_to_node(cpu)); + if (!scratch) + return; + per_cpu(tcp_md5sig_pool, cpu).scratch = scratch; + } if (per_cpu(tcp_md5sig_pool, cpu).md5_req) continue; @@ -3242,7 +3309,6 @@ int tcp_abort(struct sock *sk, int err) local_bh_enable(); return 0; } - sock_gen_put(sk); return -EOPNOTSUPP; } @@ -3271,7 +3337,6 @@ int tcp_abort(struct sock *sk, int err) bh_unlock_sock(sk); local_bh_enable(); release_sock(sk); - sock_put(sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 7e538f71f..10d728b68 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -293,7 +293,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - memset(info, 0, sizeof(struct tcp_dctcp_info)); + memset(&info->dctcp, 0, sizeof(info->dctcp)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { info->dctcp.dctcp_enabled = 1; info->dctcp.dctcp_ce_state = (u16) ca->ce_state; @@ -303,7 +303,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, } *attr = INET_DIAG_DCTCPINFO; - return sizeof(*info); + return sizeof(info->dctcp); } return 0; } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 4d610934f..a748c74aa 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -54,11 +54,16 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, { struct net *net = sock_net(in_skb->sk); struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req); + int err; if (IS_ERR(sk)) return PTR_ERR(sk); - return sock_diag_destroy(sk, ECONNABORTED); + err = sock_diag_destroy(sk, ECONNABORTED); + + sock_gen_put(sk); + + return err; } #endif diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 478114b36..4e777a324 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -227,6 +227,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_fastopen_add_skb(child, skb); tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; + tp->rcv_wup = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cd3c4cbbb..9d8e4f7d2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2332,10 +2332,9 @@ static void DBGUNDO(struct sock *sk, const char *msg) } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", msg, - &np->daddr, ntohs(inet->inet_dport), + &sk->sk_v6_daddr, ntohs(inet->inet_dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); @@ -3118,6 +3117,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, long ca_rtt_us = -1L; struct sk_buff *skb; u32 pkts_acked = 0; + u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3157,6 +3157,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!first_ackt.v64) first_ackt = last_ackt; + last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; reord = min(pkts_acked, reord); if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; @@ -3253,7 +3254,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = ca_rtt_us }; + .rtt_us = ca_rtt_us, + .in_flight = last_in_flight }; icsk->icsk_ca_ops->pkts_acked(sk, &sample); } @@ -5247,6 +5249,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool rst_seq_match = false; /* RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && @@ -5283,13 +5286,32 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* Step 2: check RST bit */ if (th->rst) { - /* RFC 5961 3.2 : - * If sequence number exactly matches RCV.NXT, then + /* RFC 5961 3.2 (extend to match against SACK too if available): + * If seq num matches RCV.NXT or the right-most SACK block, + * then * RESET the connection * else * Send a challenge ACK */ - if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + rst_seq_match = true; + } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int max_sack = sp[0].end_seq; + int this_sack; + + for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; + ++this_sack) { + max_sack = after(sp[this_sack].end_seq, + max_sack) ? + sp[this_sack].end_seq : max_sack; + } + + if (TCP_SKB_CB(skb)->seq == max_sack) + rst_seq_match = true; + } + + if (rst_seq_match) tcp_reset(sk); else tcp_send_challenge_ack(sk, skb); @@ -5949,7 +5971,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * so release it. */ if (req) { - tp->total_retrans = req->num_retrans; + inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); } else { /* Make sure socket is routed, for correct metrics. */ @@ -6211,6 +6233,9 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, kmemcheck_annotate_bitfield(ireq, flags); ireq->opt = NULL; +#if IS_ENABLED(CONFIG_IPV6) + ireq->pktopts = NULL; +#endif atomic64_set(&ireq->ir_cookie, 0); ireq->ireq_state = TCP_NEW_SYN_RECV; write_pnet(&ireq->ireq_net, sock_net(sk_listener)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 70dbd19fd..ba2ab6fcf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1040,27 +1040,28 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, GFP_KERNEL); } -static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, - __be32 daddr, __be32 saddr, int nbytes) +static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, + __be32 daddr, __be32 saddr, + const struct tcphdr *th, int nbytes) { struct tcp4_pseudohdr *bp; struct scatterlist sg; + struct tcphdr *_th; - bp = &hp->md5_blk.ip4; - - /* - * 1. the TCP pseudo-header (in the order: source IP address, - * destination IP address, zero-padded protocol number, and - * segment length) - */ + bp = hp->scratch; bp->saddr = saddr; bp->daddr = daddr; bp->pad = 0; bp->protocol = IPPROTO_TCP; bp->len = cpu_to_be16(nbytes); - sg_init_one(&sg, bp, sizeof(*bp)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + _th = (struct tcphdr *)(bp + 1); + memcpy(_th, th, sizeof(*th)); + _th->check = 0; + + sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, + sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->md5_req); } @@ -1077,9 +1078,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; @@ -1123,9 +1122,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) goto clear_hash; diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c new file mode 100644 index 000000000..5de82a8d4 --- /dev/null +++ b/net/ipv4/tcp_nv.c @@ -0,0 +1,476 @@ +/* + * TCP NV: TCP with Congestion Avoidance + * + * TCP-NV is a successor of TCP-Vegas that has been developed to + * deal with the issues that occur in modern networks. + * Like TCP-Vegas, TCP-NV supports true congestion avoidance, + * the ability to detect congestion before packet losses occur. + * When congestion (queue buildup) starts to occur, TCP-NV + * predicts what the cwnd size should be for the current + * throughput and it reduces the cwnd proportionally to + * the difference between the current cwnd and the predicted cwnd. + * + * NV is only recommeneded for traffic within a data center, and when + * all the flows are NV (at least those within the data center). This + * is due to the inherent unfairness between flows using losses to + * detect congestion (congestion control) and those that use queue + * buildup to detect congestion (congestion avoidance). + * + * Note: High NIC coalescence values may lower the performance of NV + * due to the increased noise in RTT values. In particular, we have + * seen issues with rx-frames values greater than 8. + * + * TODO: + * 1) Add mechanism to deal with reverse congestion. + */ + +#include +#include +#include +#include +#include + +/* TCP NV parameters + * + * nv_pad Max number of queued packets allowed in network + * nv_pad_buffer Do not grow cwnd if this closed to nv_pad + * nv_reset_period How often (in) seconds)to reset min_rtt + * nv_min_cwnd Don't decrease cwnd below this if there are no losses + * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected + * nv_ssthresh_factor On congestion set ssthresh to this * / 8 + * nv_rtt_factor RTT averaging factor + * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur + * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd + * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd + * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping + * slow-start due to congestion + * nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion + * nv_rtt_min_cnt Wait these many RTTs before making congesion decision + * nv_cwnd_growth_rate_neg + * nv_cwnd_growth_rate_pos + * How quickly to double growth rate (not rate) of cwnd when not + * congested. One value (nv_cwnd_growth_rate_neg) for when + * rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos) + * otherwise. + */ + +static int nv_pad __read_mostly = 10; +static int nv_pad_buffer __read_mostly = 2; +static int nv_reset_period __read_mostly = 5; /* in seconds */ +static int nv_min_cwnd __read_mostly = 2; +static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ +static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ +static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ +static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ +static int nv_cwnd_growth_rate_neg __read_mostly = 8; +static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ +static int nv_dec_eval_min_calls __read_mostly = 60; +static int nv_inc_eval_min_calls __read_mostly = 20; +static int nv_ssthresh_eval_min_calls __read_mostly = 30; +static int nv_stop_rtt_cnt __read_mostly = 10; +static int nv_rtt_min_cnt __read_mostly = 2; + +module_param(nv_pad, int, 0644); +MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network"); +module_param(nv_reset_period, int, 0644); +MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)"); +module_param(nv_min_cwnd, int, 0644); +MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value" + " without losses"); + +/* TCP NV Parameters */ +struct tcpnv { + unsigned long nv_min_rtt_reset_jiffies; /* when to switch to + * nv_min_rtt_new */ + s8 cwnd_growth_factor; /* Current cwnd growth factor, + * < 0 => less than 1 packet/RTT */ + u8 available8; + u16 available16; + u32 loss_cwnd; /* cwnd at last loss */ + u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */ + nv_reset:1, /* whether to reset values */ + nv_catchup:1; /* whether we are growing because + * of temporary cwnd decrease */ + u8 nv_eval_call_cnt; /* call count since last eval */ + u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is + * smaller than this. It may grow to handle + * TSO, LRO and interrupt coalescence because + * with these a small cwnd cannot saturate + * the link. Note that this is different from + * the file local nv_min_cwnd */ + u8 nv_rtt_cnt; /* RTTs without making ca decision */; + u32 nv_last_rtt; /* last rtt */ + u32 nv_min_rtt; /* active min rtt. Used to determine slope */ + u32 nv_min_rtt_new; /* min rtt for future use */ + u32 nv_rtt_max_rate; /* max rate seen during current RTT */ + u32 nv_rtt_start_seq; /* current RTT ends when packet arrives + * acking beyond nv_rtt_start_seq */ + u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is + * used to determine bytes acked since last + * call to bictcp_acked */ + u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */ +}; + +#define NV_INIT_RTT U32_MAX +#define NV_MIN_CWND 4 +#define NV_MIN_CWND_GROW 2 +#define NV_TSO_CWND_BOUND 80 + +static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + ca->nv_reset = 0; + ca->loss_cwnd = 0; + ca->nv_no_cong_cnt = 0; + ca->nv_rtt_cnt = 0; + ca->nv_last_rtt = 0; + ca->nv_rtt_max_rate = 0; + ca->nv_rtt_start_seq = tp->snd_una; + ca->nv_eval_call_cnt = 0; + ca->nv_last_snd_una = tp->snd_una; +} + +static void tcpnv_init(struct sock *sk) +{ + struct tcpnv *ca = inet_csk_ca(sk); + + tcpnv_reset(ca, sk); + + ca->nv_allow_cwnd_growth = 1; + ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; + ca->nv_min_rtt = NV_INIT_RTT; + ca->nv_min_rtt_new = NV_INIT_RTT; + ca->nv_min_cwnd = NV_MIN_CWND; + ca->nv_catchup = 0; + ca->cwnd_growth_factor = 0; +} + +static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcpnv *ca = inet_csk_ca(sk); + u32 cnt; + + if (!tcp_is_cwnd_limited(sk)) + return; + + /* Only grow cwnd if NV has not detected congestion */ + if (!ca->nv_allow_cwnd_growth) + return; + + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } + + if (ca->cwnd_growth_factor < 0) { + cnt = tp->snd_cwnd << -ca->cwnd_growth_factor; + tcp_cong_avoid_ai(tp, cnt, acked); + } else { + cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor); + tcp_cong_avoid_ai(tp, cnt, acked); + } +} + +static u32 tcpnv_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcpnv *ca = inet_csk_ca(sk); + + ca->loss_cwnd = tp->snd_cwnd; + return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); +} + +static u32 tcpnv_undo_cwnd(struct sock *sk) +{ + struct tcpnv *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + +static void tcpnv_state(struct sock *sk, u8 new_state) +{ + struct tcpnv *ca = inet_csk_ca(sk); + + if (new_state == TCP_CA_Open && ca->nv_reset) { + tcpnv_reset(ca, sk); + } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR || + new_state == TCP_CA_Recovery) { + ca->nv_reset = 1; + ca->nv_allow_cwnd_growth = 0; + if (new_state == TCP_CA_Loss) { + /* Reset cwnd growth factor to Reno value */ + if (ca->cwnd_growth_factor > 0) + ca->cwnd_growth_factor = 0; + /* Decrease growth rate if allowed */ + if (nv_cwnd_growth_rate_neg > 0 && + ca->cwnd_growth_factor > -8) + ca->cwnd_growth_factor--; + } + } +} + +/* Do congestion avoidance calculations for TCP-NV + */ +static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcpnv *ca = inet_csk_ca(sk); + unsigned long now = jiffies; + s64 rate64 = 0; + u32 rate, max_win, cwnd_by_slope; + u32 avg_rtt; + u32 bytes_acked = 0; + + /* Some calls are for duplicates without timetamps */ + if (sample->rtt_us < 0) + return; + + /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */ + if (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Disorder) + return; + + /* Stop cwnd growth if we were in catch up mode */ + if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) { + ca->nv_catchup = 0; + ca->nv_allow_cwnd_growth = 0; + } + + bytes_acked = tp->snd_una - ca->nv_last_snd_una; + ca->nv_last_snd_una = tp->snd_una; + + if (sample->in_flight == 0) + return; + + /* Calculate moving average of RTT */ + if (nv_rtt_factor > 0) { + if (ca->nv_last_rtt > 0) { + avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor + + ((u64)ca->nv_last_rtt) + * (256 - nv_rtt_factor)) >> 8; + } else { + avg_rtt = sample->rtt_us; + ca->nv_min_rtt = avg_rtt << 1; + } + ca->nv_last_rtt = avg_rtt; + } else { + avg_rtt = sample->rtt_us; + } + + /* rate in 100's bits per second */ + rate64 = ((u64)sample->in_flight) * 8000000; + rate = (u32)div64_u64(rate64, (u64)(avg_rtt * 100)); + + /* Remember the maximum rate seen during this RTT + * Note: It may be more than one RTT. This function should be + * called at least nv_dec_eval_min_calls times. + */ + if (ca->nv_rtt_max_rate < rate) + ca->nv_rtt_max_rate = rate; + + /* We have valid information, increment counter */ + if (ca->nv_eval_call_cnt < 255) + ca->nv_eval_call_cnt++; + + /* update min rtt if necessary */ + if (avg_rtt < ca->nv_min_rtt) + ca->nv_min_rtt = avg_rtt; + + /* update future min_rtt if necessary */ + if (avg_rtt < ca->nv_min_rtt_new) + ca->nv_min_rtt_new = avg_rtt; + + /* nv_min_rtt is updated with the minimum (possibley averaged) rtt + * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a + * warm reset). This new nv_min_rtt will be continued to be updated + * and be used for another sysctl_tcp_nv_reset_period seconds, + * when it will be updated again. + * In practice we introduce some randomness, so the actual period used + * is chosen randomly from the range: + * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4) + */ + if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) { + unsigned char rand; + + ca->nv_min_rtt = ca->nv_min_rtt_new; + ca->nv_min_rtt_new = NV_INIT_RTT; + get_random_bytes(&rand, 1); + ca->nv_min_rtt_reset_jiffies = + now + ((nv_reset_period * (384 + rand) * HZ) >> 9); + /* Every so often we decrease ca->nv_min_cwnd in case previous + * value is no longer accurate. + */ + ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND); + } + + /* Once per RTT check if we need to do congestion avoidance */ + if (before(ca->nv_rtt_start_seq, tp->snd_una)) { + ca->nv_rtt_start_seq = tp->snd_nxt; + if (ca->nv_rtt_cnt < 0xff) + /* Increase counter for RTTs without CA decision */ + ca->nv_rtt_cnt++; + + /* If this function is only called once within an RTT + * the cwnd is probably too small (in some cases due to + * tso, lro or interrupt coalescence), so we increase + * ca->nv_min_cwnd. + */ + if (ca->nv_eval_call_cnt == 1 && + bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache && + ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) { + ca->nv_min_cwnd = min(ca->nv_min_cwnd + + NV_MIN_CWND_GROW, + NV_TSO_CWND_BOUND + 1); + ca->nv_rtt_start_seq = tp->snd_nxt + + ca->nv_min_cwnd * tp->mss_cache; + ca->nv_eval_call_cnt = 0; + ca->nv_allow_cwnd_growth = 1; + return; + } + + /* Find the ideal cwnd for current rate from slope + * slope = 80000.0 * mss / nv_min_rtt + * cwnd_by_slope = nv_rtt_max_rate / slope + */ + cwnd_by_slope = (u32) + div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt, + (u64)(80000 * tp->mss_cache)); + max_win = cwnd_by_slope + nv_pad; + + /* If cwnd > max_win, decrease cwnd + * if cwnd < max_win, grow cwnd + * else leave the same + */ + if (tp->snd_cwnd > max_win) { + /* there is congestion, check that it is ok + * to make a CA decision + * 1. We should have at least nv_dec_eval_min_calls + * data points before making a CA decision + * 2. We only make a congesion decision after + * nv_rtt_min_cnt RTTs + */ + if (ca->nv_rtt_cnt < nv_rtt_min_cnt) { + return; + } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) { + if (ca->nv_eval_call_cnt < + nv_ssthresh_eval_min_calls) + return; + /* otherwise we will decrease cwnd */ + } else if (ca->nv_eval_call_cnt < + nv_dec_eval_min_calls) { + if (ca->nv_allow_cwnd_growth && + ca->nv_rtt_cnt > nv_stop_rtt_cnt) + ca->nv_allow_cwnd_growth = 0; + return; + } + + /* We have enough data to determine we are congested */ + ca->nv_allow_cwnd_growth = 0; + tp->snd_ssthresh = + (nv_ssthresh_factor * max_win) >> 3; + if (tp->snd_cwnd - max_win > 2) { + /* gap > 2, we do exponential cwnd decrease */ + int dec; + + dec = max(2U, ((tp->snd_cwnd - max_win) * + nv_cong_dec_mult) >> 7); + tp->snd_cwnd -= dec; + } else if (nv_cong_dec_mult > 0) { + tp->snd_cwnd = max_win; + } + if (ca->cwnd_growth_factor > 0) + ca->cwnd_growth_factor = 0; + ca->nv_no_cong_cnt = 0; + } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) { + /* There is no congestion, grow cwnd if allowed*/ + if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls) + return; + + ca->nv_allow_cwnd_growth = 1; + ca->nv_no_cong_cnt++; + if (ca->cwnd_growth_factor < 0 && + nv_cwnd_growth_rate_neg > 0 && + ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) { + ca->cwnd_growth_factor++; + ca->nv_no_cong_cnt = 0; + } else if (ca->cwnd_growth_factor >= 0 && + nv_cwnd_growth_rate_pos > 0 && + ca->nv_no_cong_cnt > + nv_cwnd_growth_rate_pos) { + ca->cwnd_growth_factor++; + ca->nv_no_cong_cnt = 0; + } + } else { + /* cwnd is in-between, so do nothing */ + return; + } + + /* update state */ + ca->nv_eval_call_cnt = 0; + ca->nv_rtt_cnt = 0; + ca->nv_rtt_max_rate = 0; + + /* Don't want to make cwnd < nv_min_cwnd + * (it wasn't before, if it is now is because nv + * decreased it). + */ + if (tp->snd_cwnd < nv_min_cwnd) + tp->snd_cwnd = nv_min_cwnd; + } +} + +/* Extract info for Tcp socket info provided via netlink */ +size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + const struct tcpnv *ca = inet_csk_ca(sk); + + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt; + info->vegas.tcpv_rtt = ca->nv_last_rtt; + info->vegas.tcpv_minrtt = ca->nv_min_rtt; + + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); + } + return 0; +} +EXPORT_SYMBOL_GPL(tcpnv_get_info); + +static struct tcp_congestion_ops tcpnv __read_mostly = { + .init = tcpnv_init, + .ssthresh = tcpnv_recalc_ssthresh, + .cong_avoid = tcpnv_cong_avoid, + .set_state = tcpnv_state, + .undo_cwnd = tcpnv_undo_cwnd, + .pkts_acked = tcpnv_acked, + .get_info = tcpnv_get_info, + + .owner = THIS_MODULE, + .name = "nv", +}; + +static int __init tcpnv_register(void) +{ + BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE); + + return tcp_register_congestion_control(&tcpnv); +} + +static void __exit tcpnv_unregister(void) +{ + tcp_unregister_congestion_control(&tcpnv); +} + +module_init(tcpnv_register); +module_exit(tcpnv_unregister); + +MODULE_AUTHOR("Lawrence Brakmo"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP NV"); +MODULE_VERSION("1.0"); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 30a7dd4f6..ac98740c5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -912,9 +912,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); + tp = tcp_sk(sk); if (clone_it) { skb_mstamp_get(&skb->skb_mstamp); + TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq + - tp->snd_una; if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -925,7 +928,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } inet = inet_sk(sk); - tp = tcp_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); @@ -1971,12 +1973,14 @@ static int tcp_mtu_probe(struct sock *sk) len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); - if (nskb->ip_summed) + if (nskb->ip_summed) { skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); - else - nskb->csum = skb_copy_and_csum_bits(skb, 0, - skb_put(nskb, copy), - copy, nskb->csum); + } else { + __wsum csum = skb_copy_and_csum_bits(skb, 0, + skb_put(nskb, copy), + copy, 0); + nskb->csum = csum_block_add(nskb->csum, csum, len); + } if (skb->len <= copy) { /* We've eaten all the data from this skb. @@ -2610,7 +2614,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > - min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) + min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), + sk->sk_sndbuf)) return -EAGAIN; if (skb_still_in_host_queue(sk, skb)) @@ -2835,7 +2840,7 @@ begin_fwd: if (tcp_retransmit_skb(sk, skb, segs)) return; - NET_INC_STATS(sock_net(sk), mib_idx); + NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); @@ -3580,6 +3585,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + if (unlikely(tcp_passive_fastopen(sk))) + tcp_sk(sk)->total_retrans++; } return res; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index debdd8b33..f712b411f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -24,6 +24,13 @@ int sysctl_tcp_thin_linear_timeouts __read_mostly; +/** + * tcp_write_err() - close socket and save error info + * @sk: The socket the error has appeared on. + * + * Returns: Nothing (void) + */ + static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; @@ -33,16 +40,21 @@ static void tcp_write_err(struct sock *sk) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } -/* Do not allow orphaned sockets to eat all our resources. - * This is direct violation of TCP specs, but it is required - * to prevent DoS attacks. It is called when a retransmission timeout - * or zero probe timeout occurs on orphaned socket. +/** + * tcp_out_of_resources() - Close socket if out of resources + * @sk: pointer to current socket + * @do_reset: send a last packet with reset flag * - * Criteria is still not confirmed experimentally and may change. - * We kill the socket, if: - * 1. If number of orphaned sockets exceeds an administratively configured - * limit. - * 2. If we have strong memory pressure. + * Do not allow orphaned sockets to eat all our resources. + * This is direct violation of TCP specs, but it is required + * to prevent DoS attacks. It is called when a retransmission timeout + * or zero probe timeout occurs on orphaned socket. + * + * Criteria is still not confirmed experimentally and may change. + * We kill the socket, if: + * 1. If number of orphaned sockets exceeds an administratively configured + * limit. + * 2. If we have strong memory pressure. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { @@ -74,7 +86,11 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) return 0; } -/* Calculate maximal number or retries on an orphaned socket. */ +/** + * tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket + * @sk: Pointer to the current socket. + * @alive: bool, socket alive state + */ static int tcp_orphan_retries(struct sock *sk, bool alive) { int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */ @@ -115,10 +131,22 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) } } -/* This function calculates a "timeout" which is equivalent to the timeout of a - * TCP connection after "boundary" unsuccessful, exponentially backed-off + +/** + * retransmits_timed_out() - returns true if this connection has timed out + * @sk: The current socket + * @boundary: max number of retransmissions + * @timeout: A custom timeout value. + * If set to 0 the default timeout is calculated and used. + * Using TCP_RTO_MIN and the number of unsuccessful retransmits. + * @syn_set: true if the SYN Bit was set. + * + * The default "timeout" value this function can calculate and use + * is equivalent to the timeout of a TCP Connection + * after "boundary" unsuccessful, exponentially backed-off * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if * syn_set flag is set. + * */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, @@ -257,6 +285,16 @@ out: sk_mem_reclaim(sk); } + +/** + * tcp_delack_timer() - The TCP delayed ACK timeout handler + * @data: Pointer to the current socket. (gets casted to struct sock *) + * + * This function gets (indirectly) called when the kernel timer for a TCP packet + * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work. + * + * Returns: Nothing (void) + */ static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock *)data; @@ -346,14 +384,23 @@ static void tcp_fastopen_synack_timer(struct sock *sk) */ inet_rtx_syn_ack(sk, req); req->num_timeout++; + icsk->icsk_retransmits++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); } -/* - * The TCP retransmit timer. - */ +/** + * tcp_retransmit_timer() - The TCP retransmit timeout handler + * @sk: Pointer to the current socket. + * + * This function gets called when the kernel timer for a TCP packet + * of this socket expires. + * + * It handles retransmission, timer adjustment and other necesarry measures. + * + * Returns: Nothing (void) + */ void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -494,7 +541,8 @@ out_reset_timer: out:; } -/* Called with BH disabled */ +/* Called with bottom-half processing disabled. + Called by tcp_write_timer() */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -539,7 +587,7 @@ static void tcp_write_timer(unsigned long data) if (!sock_owned_by_user(sk)) { tcp_write_timer_handler(sk); } else { - /* deleguate our work to tcp_release_cb() */ + /* delegate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) sock_hold(sk); } diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 0d0171830..ec35eaa5c 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,11 +17,14 @@ static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { - return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; + return (family == AF_INET) ? &tunnel4_handlers : + (family == AF_INET6) ? &tunnel64_handlers : + &tunnelmpls4_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) @@ -125,6 +129,26 @@ drop: } #endif +#if IS_ENABLED(CONFIG_MPLS) +static int tunnelmpls4_rcv(struct sk_buff *skb) +{ + struct xfrm_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct mpls_label))) + goto drop; + + for_each_tunnel_rcu(tunnelmpls4_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} +#endif + static void tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; @@ -145,6 +169,17 @@ static void tunnel64_err(struct sk_buff *skb, u32 info) } #endif +#if IS_ENABLED(CONFIG_MPLS) +static void tunnelmpls4_err(struct sk_buff *skb, u32 info) +{ + struct xfrm_tunnel *handler; + + for_each_tunnel_rcu(tunnelmpls4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} +#endif + static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, @@ -161,24 +196,47 @@ static const struct net_protocol tunnel64_protocol = { }; #endif +#if IS_ENABLED(CONFIG_MPLS) +static const struct net_protocol tunnelmpls4_protocol = { + .handler = tunnelmpls4_rcv, + .err_handler = tunnelmpls4_err, + .no_policy = 1, + .netns_ok = 1, +}; +#endif + static int __init tunnel4_init(void) { - if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) { - pr_err("%s: can't add protocol\n", __func__); - return -EAGAIN; - } + if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) + goto err; #if IS_ENABLED(CONFIG_IPV6) if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { - pr_err("tunnel64 init: can't add protocol\n"); inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); - return -EAGAIN; + goto err; + } +#endif +#if IS_ENABLED(CONFIG_MPLS) + if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) { + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); +#if IS_ENABLED(CONFIG_IPV6) + inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); +#endif + goto err; } #endif return 0; + +err: + pr_err("%s: can't add protocol\n", __func__); + return -EAGAIN; } static void __exit tunnel4_fini(void) { +#if IS_ENABLED(CONFIG_MPLS) + if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) + pr_err("tunnelmpls4 close: can't remove protocol\n"); +#endif #if IS_ENABLED(CONFIG_IPV6) if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) pr_err("tunnel64 close: can't remove protocol\n"); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 00d18c57c..5fdcb8d10 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2216,7 +2216,6 @@ struct proto udp_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 47f12c73d..58bd39fb1 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -76,6 +76,67 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); +void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + if (!dev->netdev_ops->ndo_udp_tunnel_add) + return; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); + +/* Notify netdevs that UDP port started listening */ +void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_add) + continue; + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); + +/* Notify netdevs that UDP port is no more listening */ +void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_del) + continue; + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); + void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 3b3efbda4..2eea073e2 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -55,7 +55,6 @@ struct proto udplite_prot = { .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, .obj_size = sizeof(struct udp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7b0edb37a..41f5b504a 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -29,7 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; - fl4->flowi4_oif = oif; + fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); if (saddr) fl4->saddr = saddr->a4; @@ -295,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = { { } }; -static int __net_init xfrm4_net_sysctl_init(struct net *net) +static __net_init int xfrm4_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -323,7 +323,7 @@ err_alloc: return -ENOMEM; } -static void __net_exit xfrm4_net_sysctl_exit(struct net *net) +static __net_exit void xfrm4_net_sysctl_exit(struct net *net) { struct ctl_table *table; @@ -336,12 +336,12 @@ static void __net_exit xfrm4_net_sysctl_exit(struct net *net) kfree(table); } #else /* CONFIG_SYSCTL */ -static int inline xfrm4_net_sysctl_init(struct net *net) +static inline int xfrm4_net_sysctl_init(struct net *net) { return 0; } -static void inline xfrm4_net_sysctl_exit(struct net *net) +static inline void xfrm4_net_sysctl_exit(struct net *net) { } #endif diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 6d8ea0992..c174ccb34 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -22,6 +22,7 @@ ipv6-$(CONFIG_NETFILTER) += netfilter.o ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_PROC_FS) += proc.o ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o +ipv6-$(CONFIG_NETLABEL) += calipso.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 82e367b9e..2f1f5d439 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -547,7 +547,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_ATOMIC); + skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; @@ -559,7 +559,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err); @@ -778,7 +778,14 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) } if (p == &net->ipv6.devconf_all->forwarding) { + int old_dflt = net->ipv6.devconf_dflt->forwarding; + net->ipv6.devconf_dflt->forwarding = newf; + if ((!newf) ^ (!old_dflt)) + inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + addrconf_forward_change(net, newf); if ((!newf) ^ (!old)) inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, @@ -1524,6 +1531,28 @@ out: return hiscore_idx; } +static int ipv6_get_saddr_master(struct net *net, + const struct net_device *dst_dev, + const struct net_device *master, + struct ipv6_saddr_dst *dst, + struct ipv6_saddr_score *scores, + int hiscore_idx) +{ + struct inet6_dev *idev; + + idev = __in6_dev_get(dst_dev); + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev, + scores, hiscore_idx); + + idev = __in6_dev_get(master); + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev, + scores, hiscore_idx); + + return hiscore_idx; +} + int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) @@ -1577,13 +1606,39 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, if (idev) hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } else { + const struct net_device *master; + int master_idx = 0; + + /* if dst_dev exists and is enslaved to an L3 device, then + * prefer addresses from dst_dev and then the master over + * any other enslaved devices in the L3 domain. + */ + master = l3mdev_master_dev_rcu(dst_dev); + if (master) { + master_idx = master->ifindex; + + hiscore_idx = ipv6_get_saddr_master(net, dst_dev, + master, &dst, + scores, hiscore_idx); + + if (scores[hiscore_idx].ifa) + goto out; + } + for_each_netdev_rcu(net, dev) { + /* only consider addresses on devices in the + * same L3 domain + */ + if (l3mdev_master_ifindex_rcu(dev) != master_idx) + continue; idev = __in6_dev_get(dev); if (!idev) continue; hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } } + +out: rcu_read_unlock(); hiscore = &scores[hiscore_idx]; @@ -1824,7 +1879,6 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp) void addrconf_dad_failure(struct inet6_ifaddr *ifp) { - struct in6_addr addr; struct inet6_dev *idev = ifp->idev; struct net *net = dev_net(ifp->idev->dev); @@ -1886,18 +1940,6 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) in6_ifa_put(ifp2); lock_errdad: spin_lock_bh(&ifp->lock); - } else if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { - addr.s6_addr32[0] = htonl(0xfe800000); - addr.s6_addr32[1] = 0; - - if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && - ipv6_addr_equal(&ifp->addr, &addr)) { - /* DAD failed for link-local based on MAC address */ - idev->cnf.disable_ipv6 = 1; - - pr_info("%s: IPv6 being disabled!\n", - ifp->idev->dev->name); - } } errdad: @@ -2255,7 +2297,7 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev) return ERR_PTR(-EACCES); /* Add default multicast route */ - if (!(dev->flags & IFF_LOOPBACK)) + if (!(dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev)) addrconf_add_mroute(dev); return idev; @@ -2334,12 +2376,109 @@ static bool is_addr_mode_generate_stable(struct inet6_dev *idev) idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; } +int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + const struct in6_addr *addr, int addr_type, + u32 addr_flags, bool sllao, bool tokenized, + __u32 valid_lft, u32 prefered_lft) +{ + struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); + int create = 0, update_lft = 0; + + if (!ifp && valid_lft) { + int max_addresses = in6_dev->cnf.max_addresses; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (in6_dev->cnf.optimistic_dad && + !net->ipv6.devconf_all->forwarding && sllao) + addr_flags |= IFA_F_OPTIMISTIC; +#endif + + /* Do not allow to create too much of autoconfigured + * addresses; this would be too easy way to crash kernel. + */ + if (!max_addresses || + ipv6_count_addresses(in6_dev) < max_addresses) + ifp = ipv6_add_addr(in6_dev, addr, NULL, + pinfo->prefix_len, + addr_type&IPV6_ADDR_SCOPE_MASK, + addr_flags, valid_lft, + prefered_lft); + + if (IS_ERR_OR_NULL(ifp)) + return -1; + + update_lft = 0; + create = 1; + spin_lock_bh(&ifp->lock); + ifp->flags |= IFA_F_MANAGETEMPADDR; + ifp->cstamp = jiffies; + ifp->tokenized = tokenized; + spin_unlock_bh(&ifp->lock); + addrconf_dad_start(ifp); + } + + if (ifp) { + u32 flags; + unsigned long now; + u32 stored_lft; + + /* update lifetime (RFC2462 5.5.3 e) */ + spin_lock_bh(&ifp->lock); + now = jiffies; + if (ifp->valid_lft > (now - ifp->tstamp) / HZ) + stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; + else + stored_lft = 0; + if (!update_lft && !create && stored_lft) { + const u32 minimum_lft = min_t(u32, + stored_lft, MIN_VALID_LIFETIME); + valid_lft = max(valid_lft, minimum_lft); + + /* RFC4862 Section 5.5.3e: + * "Note that the preferred lifetime of the + * corresponding address is always reset to + * the Preferred Lifetime in the received + * Prefix Information option, regardless of + * whether the valid lifetime is also reset or + * ignored." + * + * So we should always update prefered_lft here. + */ + update_lft = 1; + } + + if (update_lft) { + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = now; + flags = ifp->flags; + ifp->flags &= ~IFA_F_DEPRECATED; + spin_unlock_bh(&ifp->lock); + + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + } else + spin_unlock_bh(&ifp->lock); + + manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft, + create, now); + + in6_ifa_put(ifp); + addrconf_verify(); + } + + return 0; +} +EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr); + void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; __u32 valid_lft; __u32 prefered_lft; - int addr_type; + int addr_type, err; u32 addr_flags = 0; struct inet6_dev *in6_dev; struct net *net = dev_net(dev); @@ -2433,10 +2572,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) /* Try to figure out our local address for this prefix */ if (pinfo->autoconf && in6_dev->cnf.autoconf) { - struct inet6_ifaddr *ifp; struct in6_addr addr; - int create = 0, update_lft = 0; - bool tokenized = false; + bool tokenized = false, dev_addr_generated = false; if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); @@ -2454,106 +2591,36 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) goto ok; } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { - in6_dev_put(in6_dev); - return; + goto put; + } else { + dev_addr_generated = true; } goto ok; } net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n", pinfo->prefix_len); - in6_dev_put(in6_dev); - return; + goto put; ok: + err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, + &addr, addr_type, + addr_flags, sllao, + tokenized, valid_lft, + prefered_lft); + if (err) + goto put; - ifp = ipv6_get_ifaddr(net, &addr, dev, 1); - - if (!ifp && valid_lft) { - int max_addresses = in6_dev->cnf.max_addresses; - -#ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && - !net->ipv6.devconf_all->forwarding && sllao) - addr_flags |= IFA_F_OPTIMISTIC; -#endif - - /* Do not allow to create too much of autoconfigured - * addresses; this would be too easy way to crash kernel. - */ - if (!max_addresses || - ipv6_count_addresses(in6_dev) < max_addresses) - ifp = ipv6_add_addr(in6_dev, &addr, NULL, - pinfo->prefix_len, - addr_type&IPV6_ADDR_SCOPE_MASK, - addr_flags, valid_lft, - prefered_lft); - - if (IS_ERR_OR_NULL(ifp)) { - in6_dev_put(in6_dev); - return; - } - - update_lft = 0; - create = 1; - spin_lock_bh(&ifp->lock); - ifp->flags |= IFA_F_MANAGETEMPADDR; - ifp->cstamp = jiffies; - ifp->tokenized = tokenized; - spin_unlock_bh(&ifp->lock); - addrconf_dad_start(ifp); - } - - if (ifp) { - u32 flags; - unsigned long now; - u32 stored_lft; - - /* update lifetime (RFC2462 5.5.3 e) */ - spin_lock_bh(&ifp->lock); - now = jiffies; - if (ifp->valid_lft > (now - ifp->tstamp) / HZ) - stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; - else - stored_lft = 0; - if (!update_lft && !create && stored_lft) { - const u32 minimum_lft = min_t(u32, - stored_lft, MIN_VALID_LIFETIME); - valid_lft = max(valid_lft, minimum_lft); - - /* RFC4862 Section 5.5.3e: - * "Note that the preferred lifetime of the - * corresponding address is always reset to - * the Preferred Lifetime in the received - * Prefix Information option, regardless of - * whether the valid lifetime is also reset or - * ignored." - * - * So we should always update prefered_lft here. - */ - update_lft = 1; - } - - if (update_lft) { - ifp->valid_lft = valid_lft; - ifp->prefered_lft = prefered_lft; - ifp->tstamp = now; - flags = ifp->flags; - ifp->flags &= ~IFA_F_DEPRECATED; - spin_unlock_bh(&ifp->lock); - - if (!(flags&IFA_F_TENTATIVE)) - ipv6_ifa_notify(0, ifp); - } else - spin_unlock_bh(&ifp->lock); - - manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft, - create, now); - - in6_ifa_put(ifp); - addrconf_verify(); - } + /* Ignore error case here because previous prefix add addr was + * successful which will be notified. + */ + ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr, + addr_type, addr_flags, sllao, + tokenized, valid_lft, + prefered_lft, + dev_addr_generated); } inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); +put: in6_dev_put(in6_dev); } @@ -2948,8 +3015,8 @@ static void init_loopback(struct net_device *dev) } } -static void addrconf_add_linklocal(struct inet6_dev *idev, - const struct in6_addr *addr, u32 flags) +void addrconf_add_linklocal(struct inet6_dev *idev, + const struct in6_addr *addr, u32 flags) { struct inet6_ifaddr *ifp; u32 addr_flags = flags | IFA_F_PERMANENT; @@ -2968,6 +3035,7 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, in6_ifa_put(ifp); } } +EXPORT_SYMBOL_GPL(addrconf_add_linklocal); static bool ipv6_reserved_interfaceid(struct in6_addr address) { @@ -3551,8 +3619,7 @@ restart: state = ifa->state; ifa->state = INET6_IFADDR_STATE_DEAD; - list_del(&ifa->if_list); - list_add(&ifa->if_list, &del_list); + list_move(&ifa->if_list, &del_list); } spin_unlock_bh(&ifa->lock); @@ -3749,6 +3816,7 @@ static void addrconf_dad_work(struct work_struct *w) dad_work); struct inet6_dev *idev = ifp->idev; struct in6_addr mcaddr; + bool disable_ipv6 = false; enum { DAD_PROCESS, @@ -3765,6 +3833,24 @@ static void addrconf_dad_work(struct work_struct *w) } else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) { action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; + + if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 && + !(ifp->flags & IFA_F_STABLE_PRIVACY)) { + struct in6_addr addr; + + addr.s6_addr32[0] = htonl(0xfe800000); + addr.s6_addr32[1] = 0; + + if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && + ipv6_addr_equal(&ifp->addr, &addr)) { + /* DAD failed for link-local based on MAC */ + idev->cnf.disable_ipv6 = 1; + + pr_info("%s: IPv6 being disabled!\n", + ifp->idev->dev->name); + disable_ipv6 = true; + } + } } spin_unlock_bh(&ifp->lock); @@ -3774,6 +3860,8 @@ static void addrconf_dad_work(struct work_struct *w) } else if (action == DAD_ABORT) { in6_ifa_hold(ifp); addrconf_dad_stop(ifp, 1); + if (disable_ipv6) + addrconf_ifdown(idev->dev, 0); goto out; } @@ -5946,7 +6034,7 @@ static const struct ctl_table addrconf_sysctl[] = { static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct inet6_dev *idev, struct ipv6_devconf *p) { - int i; + int i, ifindex; struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; @@ -5966,6 +6054,13 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, if (!p->sysctl_header) goto free; + if (!strcmp(dev_name, "all")) + ifindex = NETCONFA_IFINDEX_ALL; + else if (!strcmp(dev_name, "default")) + ifindex = NETCONFA_IFINDEX_DEFAULT; + else + ifindex = idev->dev->ifindex; + inet6_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); return 0; free: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index bfa86f040..b454055ba 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -60,6 +60,7 @@ #ifdef CONFIG_IPV6_TUNNEL #include #endif +#include #include #include @@ -92,6 +93,12 @@ MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); +bool ipv6_mod_enabled(void) +{ + return disable_ipv6_mod == 0; +} +EXPORT_SYMBOL_GPL(ipv6_mod_enabled); + static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); @@ -977,6 +984,10 @@ static int __init inet6_init(void) if (err) goto pingv6_fail; + err = calipso_init(); + if (err) + goto calipso_fail; + #ifdef CONFIG_SYSCTL err = ipv6_sysctl_register(); if (err) @@ -987,8 +998,10 @@ out: #ifdef CONFIG_SYSCTL sysctl_fail: - pingv6_exit(); + calipso_exit(); #endif +calipso_fail: + pingv6_exit(); pingv6_fail: ipv6_packet_cleanup(); ipv6_packet_fail: diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c new file mode 100644 index 000000000..37ac9de71 --- /dev/null +++ b/net/ipv6/calipso.c @@ -0,0 +1,1475 @@ +/* + * CALIPSO - Common Architecture Label IPv6 Security Option + * + * This is an implementation of the CALIPSO protocol as specified in + * RFC 5570. + * + * Authors: Paul Moore + * Huw Davies + * + */ + +/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 + * (c) Copyright Huw Davies , 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Maximium size of the calipso option including + * the two-byte TLV header. + */ +#define CALIPSO_OPT_LEN_MAX (2 + 252) + +/* Size of the minimum calipso option including + * the two-byte TLV header. + */ +#define CALIPSO_HDR_LEN (2 + 8) + +/* Maximium size of the calipso option including + * the two-byte TLV header and upto 3 bytes of + * leading pad and 7 bytes of trailing pad. + */ +#define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) + + /* Maximium size of u32 aligned buffer required to hold calipso + * option. Max of 3 initial pad bytes starting from buffer + 3. + * i.e. the worst case is when the previous tlv finishes on 4n + 3. + */ +#define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX) + +/* List of available DOI definitions */ +static DEFINE_SPINLOCK(calipso_doi_list_lock); +static LIST_HEAD(calipso_doi_list); + +/* Label mapping cache */ +int calipso_cache_enabled = 1; +int calipso_cache_bucketsize = 10; +#define CALIPSO_CACHE_BUCKETBITS 7 +#define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS) +#define CALIPSO_CACHE_REORDERLIMIT 10 +struct calipso_map_cache_bkt { + spinlock_t lock; + u32 size; + struct list_head list; +}; + +struct calipso_map_cache_entry { + u32 hash; + unsigned char *key; + size_t key_len; + + struct netlbl_lsm_cache *lsm_data; + + u32 activity; + struct list_head list; +}; + +static struct calipso_map_cache_bkt *calipso_cache; + +/* Label Mapping Cache Functions + */ + +/** + * calipso_cache_entry_free - Frees a cache entry + * @entry: the entry to free + * + * Description: + * This function frees the memory associated with a cache entry including the + * LSM cache data if there are no longer any users, i.e. reference count == 0. + * + */ +static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry) +{ + if (entry->lsm_data) + netlbl_secattr_cache_free(entry->lsm_data); + kfree(entry->key); + kfree(entry); +} + +/** + * calipso_map_cache_hash - Hashing function for the CALIPSO cache + * @key: the hash key + * @key_len: the length of the key in bytes + * + * Description: + * The CALIPSO tag hashing function. Returns a 32-bit hash value. + * + */ +static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +/** + * calipso_cache_init - Initialize the CALIPSO cache + * + * Description: + * Initializes the CALIPSO label mapping cache, this function should be called + * before any of the other functions defined in this file. Returns zero on + * success, negative values on error. + * + */ +static int __init calipso_cache_init(void) +{ + u32 iter; + + calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS, + sizeof(struct calipso_map_cache_bkt), + GFP_KERNEL); + if (!calipso_cache) + return -ENOMEM; + + for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { + spin_lock_init(&calipso_cache[iter].lock); + calipso_cache[iter].size = 0; + INIT_LIST_HEAD(&calipso_cache[iter].list); + } + + return 0; +} + +/** + * calipso_cache_invalidate - Invalidates the current CALIPSO cache + * + * Description: + * Invalidates and frees any entries in the CALIPSO cache. Returns zero on + * success and negative values on failure. + * + */ +static void calipso_cache_invalidate(void) +{ + struct calipso_map_cache_entry *entry, *tmp_entry; + u32 iter; + + for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { + spin_lock_bh(&calipso_cache[iter].lock); + list_for_each_entry_safe(entry, + tmp_entry, + &calipso_cache[iter].list, list) { + list_del(&entry->list); + calipso_cache_entry_free(entry); + } + calipso_cache[iter].size = 0; + spin_unlock_bh(&calipso_cache[iter].lock); + } +} + +/** + * calipso_cache_check - Check the CALIPSO cache for a label mapping + * @key: the buffer to check + * @key_len: buffer length in bytes + * @secattr: the security attribute struct to use + * + * Description: + * This function checks the cache to see if a label mapping already exists for + * the given key. If there is a match then the cache is adjusted and the + * @secattr struct is populated with the correct LSM security attributes. The + * cache is adjusted in the following manner if the entry is not already the + * first in the cache bucket: + * + * 1. The cache entry's activity counter is incremented + * 2. The previous (higher ranking) entry's activity counter is decremented + * 3. If the difference between the two activity counters is geater than + * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped + * + * Returns zero on success, -ENOENT for a cache miss, and other negative values + * on error. + * + */ +static int calipso_cache_check(const unsigned char *key, + u32 key_len, + struct netlbl_lsm_secattr *secattr) +{ + u32 bkt; + struct calipso_map_cache_entry *entry; + struct calipso_map_cache_entry *prev_entry = NULL; + u32 hash; + + if (!calipso_cache_enabled) + return -ENOENT; + + hash = calipso_map_cache_hash(key, key_len); + bkt = hash & (CALIPSO_CACHE_BUCKETS - 1); + spin_lock_bh(&calipso_cache[bkt].lock); + list_for_each_entry(entry, &calipso_cache[bkt].list, list) { + if (entry->hash == hash && + entry->key_len == key_len && + memcmp(entry->key, key, key_len) == 0) { + entry->activity += 1; + atomic_inc(&entry->lsm_data->refcount); + secattr->cache = entry->lsm_data; + secattr->flags |= NETLBL_SECATTR_CACHE; + secattr->type = NETLBL_NLTYPE_CALIPSO; + if (!prev_entry) { + spin_unlock_bh(&calipso_cache[bkt].lock); + return 0; + } + + if (prev_entry->activity > 0) + prev_entry->activity -= 1; + if (entry->activity > prev_entry->activity && + entry->activity - prev_entry->activity > + CALIPSO_CACHE_REORDERLIMIT) { + __list_del(entry->list.prev, entry->list.next); + __list_add(&entry->list, + prev_entry->list.prev, + &prev_entry->list); + } + + spin_unlock_bh(&calipso_cache[bkt].lock); + return 0; + } + prev_entry = entry; + } + spin_unlock_bh(&calipso_cache[bkt].lock); + + return -ENOENT; +} + +/** + * calipso_cache_add - Add an entry to the CALIPSO cache + * @calipso_ptr: the CALIPSO option + * @secattr: the packet's security attributes + * + * Description: + * Add a new entry into the CALIPSO label mapping cache. Add the new entry to + * head of the cache bucket's list, if the cache bucket is out of room remove + * the last entry in the list first. It is important to note that there is + * currently no checking for duplicate keys. Returns zero on success, + * negative values on failure. The key stored starts at calipso_ptr + 2, + * i.e. the type and length bytes are not stored, this corresponds to + * calipso_ptr[1] bytes of data. + * + */ +static int calipso_cache_add(const unsigned char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -EPERM; + u32 bkt; + struct calipso_map_cache_entry *entry = NULL; + struct calipso_map_cache_entry *old_entry = NULL; + u32 calipso_ptr_len; + + if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0) + return 0; + + calipso_ptr_len = calipso_ptr[1]; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC); + if (!entry->key) { + ret_val = -ENOMEM; + goto cache_add_failure; + } + entry->key_len = calipso_ptr_len; + entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len); + atomic_inc(&secattr->cache->refcount); + entry->lsm_data = secattr->cache; + + bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1); + spin_lock_bh(&calipso_cache[bkt].lock); + if (calipso_cache[bkt].size < calipso_cache_bucketsize) { + list_add(&entry->list, &calipso_cache[bkt].list); + calipso_cache[bkt].size += 1; + } else { + old_entry = list_entry(calipso_cache[bkt].list.prev, + struct calipso_map_cache_entry, list); + list_del(&old_entry->list); + list_add(&entry->list, &calipso_cache[bkt].list); + calipso_cache_entry_free(old_entry); + } + spin_unlock_bh(&calipso_cache[bkt].lock); + + return 0; + +cache_add_failure: + if (entry) + calipso_cache_entry_free(entry); + return ret_val; +} + +/* DOI List Functions + */ + +/** + * calipso_doi_search - Searches for a DOI definition + * @doi: the DOI to search for + * + * Description: + * Search the DOI definition list for a DOI definition with a DOI value that + * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). + * Returns a pointer to the DOI definition on success and NULL on failure. + */ +static struct calipso_doi *calipso_doi_search(u32 doi) +{ + struct calipso_doi *iter; + + list_for_each_entry_rcu(iter, &calipso_doi_list, list) + if (iter->doi == doi && atomic_read(&iter->refcount)) + return iter; + return NULL; +} + +/** + * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine + * @doi_def: the DOI structure + * @audit_info: NetLabel audit information + * + * Description: + * The caller defines a new DOI for use by the CALIPSO engine and calls this + * function to add it to the list of acceptable domains. The caller must + * ensure that the mapping table specified in @doi_def->map meets all of the + * requirements of the mapping type (see calipso.h for details). Returns + * zero on success and non-zero on failure. + * + */ +static int calipso_doi_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ + int ret_val = -EINVAL; + u32 doi; + u32 doi_type; + struct audit_buffer *audit_buf; + + doi = doi_def->doi; + doi_type = doi_def->type; + + if (doi_def->doi == CALIPSO_DOI_UNKNOWN) + goto doi_add_return; + + atomic_set(&doi_def->refcount, 1); + + spin_lock(&calipso_doi_list_lock); + if (calipso_doi_search(doi_def->doi)) { + spin_unlock(&calipso_doi_list_lock); + ret_val = -EEXIST; + goto doi_add_return; + } + list_add_tail_rcu(&doi_def->list, &calipso_doi_list); + spin_unlock(&calipso_doi_list_lock); + ret_val = 0; + +doi_add_return: + audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info); + if (audit_buf) { + const char *type_str; + + switch (doi_type) { + case CALIPSO_MAP_PASS: + type_str = "pass"; + break; + default: + type_str = "(unknown)"; + } + audit_log_format(audit_buf, + " calipso_doi=%u calipso_type=%s res=%u", + doi, type_str, ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + + return ret_val; +} + +/** + * calipso_doi_free - Frees a DOI definition + * @doi_def: the DOI definition + * + * Description: + * This function frees all of the memory associated with a DOI definition. + * + */ +static void calipso_doi_free(struct calipso_doi *doi_def) +{ + kfree(doi_def); +} + +/** + * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that the memory allocated to the DOI definition can be released + * safely. + * + */ +static void calipso_doi_free_rcu(struct rcu_head *entry) +{ + struct calipso_doi *doi_def; + + doi_def = container_of(entry, struct calipso_doi, rcu); + calipso_doi_free(doi_def); +} + +/** + * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine + * @doi: the DOI value + * @audit_secid: the LSM secid to use in the audit message + * + * Description: + * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will + * be called to release their own LSM domain mappings as well as our own + * domain list. Returns zero on success and negative values on failure. + * + */ +static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) +{ + int ret_val; + struct calipso_doi *doi_def; + struct audit_buffer *audit_buf; + + spin_lock(&calipso_doi_list_lock); + doi_def = calipso_doi_search(doi); + if (!doi_def) { + spin_unlock(&calipso_doi_list_lock); + ret_val = -ENOENT; + goto doi_remove_return; + } + if (!atomic_dec_and_test(&doi_def->refcount)) { + spin_unlock(&calipso_doi_list_lock); + ret_val = -EBUSY; + goto doi_remove_return; + } + list_del_rcu(&doi_def->list); + spin_unlock(&calipso_doi_list_lock); + + call_rcu(&doi_def->rcu, calipso_doi_free_rcu); + ret_val = 0; + +doi_remove_return: + audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info); + if (audit_buf) { + audit_log_format(audit_buf, + " calipso_doi=%u res=%u", + doi, ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + + return ret_val; +} + +/** + * calipso_doi_getdef - Returns a reference to a valid DOI definition + * @doi: the DOI value + * + * Description: + * Searches for a valid DOI definition and if one is found it is returned to + * the caller. Otherwise NULL is returned. The caller must ensure that + * calipso_doi_putdef() is called when the caller is done. + * + */ +static struct calipso_doi *calipso_doi_getdef(u32 doi) +{ + struct calipso_doi *doi_def; + + rcu_read_lock(); + doi_def = calipso_doi_search(doi); + if (!doi_def) + goto doi_getdef_return; + if (!atomic_inc_not_zero(&doi_def->refcount)) + doi_def = NULL; + +doi_getdef_return: + rcu_read_unlock(); + return doi_def; +} + +/** + * calipso_doi_putdef - Releases a reference for the given DOI definition + * @doi_def: the DOI definition + * + * Description: + * Releases a DOI definition reference obtained from calipso_doi_getdef(). + * + */ +static void calipso_doi_putdef(struct calipso_doi *doi_def) +{ + if (!doi_def) + return; + + if (!atomic_dec_and_test(&doi_def->refcount)) + return; + spin_lock(&calipso_doi_list_lock); + list_del_rcu(&doi_def->list); + spin_unlock(&calipso_doi_list_lock); + + call_rcu(&doi_def->rcu, calipso_doi_free_rcu); +} + +/** + * calipso_doi_walk - Iterate through the DOI definitions + * @skip_cnt: skip past this number of DOI definitions, updated + * @callback: callback for each DOI definition + * @cb_arg: argument for the callback function + * + * Description: + * Iterate over the DOI definition list, skipping the first @skip_cnt entries. + * For each entry call @callback, if @callback returns a negative value stop + * 'walking' through the list and return. Updates the value in @skip_cnt upon + * return. Returns zero on success, negative values on failure. + * + */ +static int calipso_doi_walk(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, + void *arg), + void *cb_arg) +{ + int ret_val = -ENOENT; + u32 doi_cnt = 0; + struct calipso_doi *iter_doi; + + rcu_read_lock(); + list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list) + if (atomic_read(&iter_doi->refcount) > 0) { + if (doi_cnt++ < *skip_cnt) + continue; + ret_val = callback(iter_doi, cb_arg); + if (ret_val < 0) { + doi_cnt--; + goto doi_walk_return; + } + } + +doi_walk_return: + rcu_read_unlock(); + *skip_cnt = doi_cnt; + return ret_val; +} + +/** + * calipso_validate - Validate a CALIPSO option + * @skb: the packet + * @option: the start of the option + * + * Description: + * This routine is called to validate a CALIPSO option. + * If the option is valid then %true is returned, otherwise + * %false is returned. + * + * The caller should have already checked that the length of the + * option (including the TLV header) is >= 10 and that the catmap + * length is consistent with the option length. + * + * We leave checks on the level and categories to the socket layer. + */ +bool calipso_validate(const struct sk_buff *skb, const unsigned char *option) +{ + struct calipso_doi *doi_def; + bool ret_val; + u16 crc, len = option[1] + 2; + static const u8 zero[2]; + + /* The original CRC runs over the option including the TLV header + * with the CRC-16 field (at offset 8) zeroed out. */ + crc = crc_ccitt(0xffff, option, 8); + crc = crc_ccitt(crc, zero, sizeof(zero)); + if (len > 10) + crc = crc_ccitt(crc, option + 10, len - 10); + crc = ~crc; + if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff)) + return false; + + rcu_read_lock(); + doi_def = calipso_doi_search(get_unaligned_be32(option + 2)); + ret_val = !!doi_def; + rcu_read_unlock(); + + return ret_val; +} + +/** + * calipso_map_cat_hton - Perform a category mapping from host to network + * @doi_def: the DOI definition + * @secattr: the security attributes + * @net_cat: the zero'd out category bitmap in network/CALIPSO format + * @net_cat_len: the length of the CALIPSO bitmap in bytes + * + * Description: + * Perform a label mapping to translate a local MLS category bitmap to the + * correct CALIPSO bitmap using the given DOI definition. Returns the minimum + * size in bytes of the network bitmap on success, negative values otherwise. + * + */ +static int calipso_map_cat_hton(const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *net_cat, + u32 net_cat_len) +{ + int spot = -1; + u32 net_spot_max = 0; + u32 net_clen_bits = net_cat_len * 8; + + for (;;) { + spot = netlbl_catmap_walk(secattr->attr.mls.cat, + spot + 1); + if (spot < 0) + break; + if (spot >= net_clen_bits) + return -ENOSPC; + netlbl_bitmap_setbit(net_cat, spot, 1); + + if (spot > net_spot_max) + net_spot_max = spot; + } + + return (net_spot_max / 32 + 1) * 4; +} + +/** + * calipso_map_cat_ntoh - Perform a category mapping from network to host + * @doi_def: the DOI definition + * @net_cat: the category bitmap in network/CALIPSO format + * @net_cat_len: the length of the CALIPSO bitmap in bytes + * @secattr: the security attributes + * + * Description: + * Perform a label mapping to translate a CALIPSO bitmap to the correct local + * MLS category bitmap using the given DOI definition. Returns zero on + * success, negative values on failure. + * + */ +static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def, + const unsigned char *net_cat, + u32 net_cat_len, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + int spot = -1; + u32 net_clen_bits = net_cat_len * 8; + + for (;;) { + spot = netlbl_bitmap_walk(net_cat, + net_clen_bits, + spot + 1, + 1); + if (spot < 0) { + if (spot == -2) + return -EFAULT; + return 0; + } + + ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, + spot, + GFP_ATOMIC); + if (ret_val != 0) + return ret_val; + } + + return -EINVAL; +} + +/** + * calipso_pad_write - Writes pad bytes in TLV format + * @buf: the buffer + * @offset: offset from start of buffer to write padding + * @count: number of pad bytes to write + * + * Description: + * Write @count bytes of TLV padding into @buffer starting at offset @offset. + * @count should be less than 8 - see RFC 4942. + * + */ +static int calipso_pad_write(unsigned char *buf, unsigned int offset, + unsigned int count) +{ + if (WARN_ON_ONCE(count >= 8)) + return -EINVAL; + + switch (count) { + case 0: + break; + case 1: + buf[offset] = IPV6_TLV_PAD1; + break; + default: + buf[offset] = IPV6_TLV_PADN; + buf[offset + 1] = count - 2; + if (count > 2) + memset(buf + offset + 2, 0, count - 2); + break; + } + return 0; +} + +/** + * calipso_genopt - Generate a CALIPSO option + * @buf: the option buffer + * @start: offset from which to write + * @buf_len: the size of opt_buf + * @doi_def: the CALIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Generate a CALIPSO option using the DOI definition and security attributes + * passed to the function. This also generates upto three bytes of leading + * padding that ensures that the option is 4n + 2 aligned. It returns the + * number of bytes written (including any initial padding). + */ +static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + u32 len, pad; + u16 crc; + static const unsigned char padding[4] = {2, 1, 0, 3}; + unsigned char *calipso; + + /* CALIPSO has 4n + 2 alignment */ + pad = padding[start & 3]; + if (buf_len <= start + pad + CALIPSO_HDR_LEN) + return -ENOSPC; + + if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) + return -EPERM; + + len = CALIPSO_HDR_LEN; + + if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { + ret_val = calipso_map_cat_hton(doi_def, + secattr, + buf + start + pad + len, + buf_len - start - pad - len); + if (ret_val < 0) + return ret_val; + len += ret_val; + } + + calipso_pad_write(buf, start, pad); + calipso = buf + start + pad; + + calipso[0] = IPV6_TLV_CALIPSO; + calipso[1] = len - 2; + *(__be32 *)(calipso + 2) = htonl(doi_def->doi); + calipso[6] = (len - CALIPSO_HDR_LEN) / 4; + calipso[7] = secattr->attr.mls.lvl, + crc = ~crc_ccitt(0xffff, calipso, len); + calipso[8] = crc & 0xff; + calipso[9] = (crc >> 8) & 0xff; + return pad + len; +} + +/* Hop-by-hop hdr helper functions + */ + +/** + * calipso_opt_update - Replaces socket's hop options with a new set + * @sk: the socket + * @hop: new hop options + * + * Description: + * Replaces @sk's hop options with @hop. @hop may be NULL to leave + * the socket with no hop options. + * + */ +static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) +{ + struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; + + txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS, + hop, hop ? ipv6_optlen(hop) : 0); + txopt_put(old); + if (IS_ERR(txopts)) + return PTR_ERR(txopts); + + txopts = ipv6_update_options(sk, txopts); + if (txopts) { + atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + txopt_put(txopts); + } + + return 0; +} + +/** + * calipso_tlv_len - Returns the length of the TLV + * @opt: the option header + * @offset: offset of the TLV within the header + * + * Description: + * Returns the length of the TLV option at offset @offset within + * the option header @opt. Checks that the entire TLV fits inside + * the option header, returns a negative value if this is not the case. + */ +static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset) +{ + unsigned char *tlv = (unsigned char *)opt; + unsigned int opt_len = ipv6_optlen(opt), tlv_len; + + if (offset < sizeof(*opt) || offset >= opt_len) + return -EINVAL; + if (tlv[offset] == IPV6_TLV_PAD1) + return 1; + if (offset + 1 >= opt_len) + return -EINVAL; + tlv_len = tlv[offset + 1] + 2; + if (offset + tlv_len > opt_len) + return -EINVAL; + return tlv_len; +} + +/** + * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header + * @hop: the hop options header + * @start: on return holds the offset of any leading padding + * @end: on return holds the offset of the first non-pad TLV after CALIPSO + * + * Description: + * Finds the space occupied by a CALIPSO option (including any leading and + * trailing padding). + * + * If a CALIPSO option exists set @start and @end to the + * offsets within @hop of the start of padding before the first + * CALIPSO option and the end of padding after the first CALIPSO + * option. In this case the function returns 0. + * + * In the absence of a CALIPSO option, @start and @end will be + * set to the start and end of any trailing padding in the header. + * This is useful when appending a new option, as the caller may want + * to overwrite some of this padding. In this case the function will + * return -ENOENT. + */ +static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start, + unsigned int *end) +{ + int ret_val = -ENOENT, tlv_len; + unsigned int opt_len, offset, offset_s = 0, offset_e = 0; + unsigned char *opt = (unsigned char *)hop; + + opt_len = ipv6_optlen(hop); + offset = sizeof(*hop); + + while (offset < opt_len) { + tlv_len = calipso_tlv_len(hop, offset); + if (tlv_len < 0) + return tlv_len; + + switch (opt[offset]) { + case IPV6_TLV_PAD1: + case IPV6_TLV_PADN: + if (offset_e) + offset_e = offset; + break; + case IPV6_TLV_CALIPSO: + ret_val = 0; + offset_e = offset; + break; + default: + if (offset_e == 0) + offset_s = offset; + else + goto out; + } + offset += tlv_len; + } + +out: + if (offset_s) + *start = offset_s + calipso_tlv_len(hop, offset_s); + else + *start = sizeof(*hop); + if (offset_e) + *end = offset_e + calipso_tlv_len(hop, offset_e); + else + *end = opt_len; + + return ret_val; +} + +/** + * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr + * @hop: the original hop options header + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Creates a new hop options header based on @hop with a + * CALIPSO option added to it. If @hop already contains a CALIPSO + * option this is overwritten, otherwise the new option is appended + * after any existing options. If @hop is NULL then the new header + * will contain just the CALIPSO option and any needed padding. + * + */ +static struct ipv6_opt_hdr * +calipso_opt_insert(struct ipv6_opt_hdr *hop, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + unsigned int start, end, buf_len, pad, hop_len; + struct ipv6_opt_hdr *new; + int ret_val; + + if (hop) { + hop_len = ipv6_optlen(hop); + ret_val = calipso_opt_find(hop, &start, &end); + if (ret_val && ret_val != -ENOENT) + return ERR_PTR(ret_val); + } else { + hop_len = 0; + start = sizeof(*hop); + end = 0; + } + + buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD; + new = kzalloc(buf_len, GFP_ATOMIC); + if (!new) + return ERR_PTR(-ENOMEM); + + if (start > sizeof(*hop)) + memcpy(new, hop, start); + ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def, + secattr); + if (ret_val < 0) { + kfree(new); + return ERR_PTR(ret_val); + } + + buf_len = start + ret_val; + /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */ + pad = ((buf_len & 4) + (end & 7)) & 7; + calipso_pad_write((unsigned char *)new, buf_len, pad); + buf_len += pad; + + if (end != hop_len) { + memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end); + buf_len += hop_len - end; + } + new->nexthdr = 0; + new->hdrlen = buf_len / 8 - 1; + + return new; +} + +/** + * calipso_opt_del - Removes the CALIPSO option from an option header + * @hop: the original header + * @new: the new header + * + * Description: + * Creates a new header based on @hop without any CALIPSO option. If @hop + * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains + * no other non-padding options, it returns zero with @new set to NULL. + * Otherwise it returns zero, creates a new header without the CALIPSO + * option (and removing as much padding as possible) and returns with + * @new set to that header. + * + */ +static int calipso_opt_del(struct ipv6_opt_hdr *hop, + struct ipv6_opt_hdr **new) +{ + int ret_val; + unsigned int start, end, delta, pad, hop_len; + + ret_val = calipso_opt_find(hop, &start, &end); + if (ret_val) + return ret_val; + + hop_len = ipv6_optlen(hop); + if (start == sizeof(*hop) && end == hop_len) { + /* There's no other option in the header so return NULL */ + *new = NULL; + return 0; + } + + delta = (end - start) & ~7; + *new = kzalloc(hop_len - delta, GFP_ATOMIC); + if (!*new) + return -ENOMEM; + + memcpy(*new, hop, start); + (*new)->hdrlen -= delta / 8; + pad = (end - start) & 7; + calipso_pad_write((unsigned char *)*new, start, pad); + if (end != hop_len) + memcpy((char *)*new + start + pad, (char *)hop + end, + hop_len - end); + + return 0; +} + +/** + * calipso_opt_getattr - Get the security attributes from a memory block + * @calipso: the CALIPSO option + * @secattr: the security attributes + * + * Description: + * Inspect @calipso and return the security attributes in @secattr. + * Returns zero on success and negative values on failure. + * + */ +static int calipso_opt_getattr(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + u32 doi, len = calipso[1], cat_len = calipso[6] * 4; + struct calipso_doi *doi_def; + + if (cat_len + 8 > len) + return -EINVAL; + + if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0) + return 0; + + doi = get_unaligned_be32(calipso + 2); + rcu_read_lock(); + doi_def = calipso_doi_search(doi); + if (!doi_def) + goto getattr_return; + + secattr->attr.mls.lvl = calipso[7]; + secattr->flags |= NETLBL_SECATTR_MLS_LVL; + + if (cat_len) { + ret_val = calipso_map_cat_ntoh(doi_def, + calipso + 10, + cat_len, + secattr); + if (ret_val != 0) { + netlbl_catmap_free(secattr->attr.mls.cat); + goto getattr_return; + } + + secattr->flags |= NETLBL_SECATTR_MLS_CAT; + } + + secattr->type = NETLBL_NLTYPE_CALIPSO; + +getattr_return: + rcu_read_unlock(); + return ret_val; +} + +/* sock functions. + */ + +/** + * calipso_sock_getattr - Get the security attributes from a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Query @sk to see if there is a CALIPSO option attached to the sock and if + * there is return the CALIPSO security attributes in @secattr. This function + * requires that @sk be locked, or privately held, but it does not do any + * locking itself. Returns zero on success and negative values on failure. + * + */ +static int calipso_sock_getattr(struct sock *sk, + struct netlbl_lsm_secattr *secattr) +{ + struct ipv6_opt_hdr *hop; + int opt_len, len, ret_val = -ENOMSG, offset; + unsigned char *opt; + struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + + if (!txopts || !txopts->hopopt) + goto done; + + hop = txopts->hopopt; + opt = (unsigned char *)hop; + opt_len = ipv6_optlen(hop); + offset = sizeof(*hop); + while (offset < opt_len) { + len = calipso_tlv_len(hop, offset); + if (len < 0) { + ret_val = len; + goto done; + } + switch (opt[offset]) { + case IPV6_TLV_CALIPSO: + if (len < CALIPSO_HDR_LEN) + ret_val = -EINVAL; + else + ret_val = calipso_opt_getattr(&opt[offset], + secattr); + goto done; + default: + offset += len; + break; + } + } +done: + txopt_put(txopts); + return ret_val; +} + +/** + * calipso_sock_setattr - Add a CALIPSO option to a socket + * @sk: the socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. This function requires + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. + * + */ +static int calipso_sock_setattr(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct ipv6_opt_hdr *old, *new; + struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + + old = NULL; + if (txopts) + old = txopts->hopopt; + + new = calipso_opt_insert(old, doi_def, secattr); + txopt_put(txopts); + if (IS_ERR(new)) + return PTR_ERR(new); + + ret_val = calipso_opt_update(sk, new); + + kfree(new); + return ret_val; +} + +/** + * calipso_sock_delattr - Delete the CALIPSO option from a socket + * @sk: the socket + * + * Description: + * Removes the CALIPSO option from a socket, if present. + * + */ +static void calipso_sock_delattr(struct sock *sk) +{ + struct ipv6_opt_hdr *new_hop; + struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + + if (!txopts || !txopts->hopopt) + goto done; + + if (calipso_opt_del(txopts->hopopt, &new_hop)) + goto done; + + calipso_opt_update(sk, new_hop); + kfree(new_hop); + +done: + txopt_put(txopts); +} + +/* request sock functions. + */ + +/** + * calipso_req_setattr - Add a CALIPSO option to a connection request socket + * @req: the connection request socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. Returns zero on success and + * negative values on failure. + * + */ +static int calipso_req_setattr(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + struct ipv6_txoptions *txopts; + struct inet_request_sock *req_inet = inet_rsk(req); + struct ipv6_opt_hdr *old, *new; + struct sock *sk = sk_to_full_sk(req_to_sk(req)); + + if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) + old = req_inet->ipv6_opt->hopopt; + else + old = NULL; + + new = calipso_opt_insert(old, doi_def, secattr); + if (IS_ERR(new)) + return PTR_ERR(new); + + txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, + new, new ? ipv6_optlen(new) : 0); + + kfree(new); + + if (IS_ERR(txopts)) + return PTR_ERR(txopts); + + txopts = xchg(&req_inet->ipv6_opt, txopts); + if (txopts) { + atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + txopt_put(txopts); + } + + return 0; +} + +/** + * calipso_req_delattr - Delete the CALIPSO option from a request socket + * @reg: the request socket + * + * Description: + * Removes the CALIPSO option from a request socket, if present. + * + */ +static void calipso_req_delattr(struct request_sock *req) +{ + struct inet_request_sock *req_inet = inet_rsk(req); + struct ipv6_opt_hdr *new; + struct ipv6_txoptions *txopts; + struct sock *sk = sk_to_full_sk(req_to_sk(req)); + + if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) + return; + + if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) + return; /* Nothing to do */ + + txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, + new, new ? ipv6_optlen(new) : 0); + + if (!IS_ERR(txopts)) { + txopts = xchg(&req_inet->ipv6_opt, txopts); + if (txopts) { + atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + txopt_put(txopts); + } + } + kfree(new); +} + +/* skbuff functions. + */ + +/** + * calipso_skbuff_optptr - Find the CALIPSO option in the packet + * @skb: the packet + * + * Description: + * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer + * to the start of the CALIPSO option on success, NULL if one if not found. + * + */ +static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb); + int offset; + + if (ip6_hdr->nexthdr != NEXTHDR_HOP) + return NULL; + + offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO); + if (offset >= 0) + return (unsigned char *)ip6_hdr + offset; + + return NULL; +} + +/** + * calipso_skbuff_setattr - Set the CALIPSO option on a packet + * @skb: the packet + * @doi_def: the CALIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Set the CALIPSO option on the given packet based on the security attributes. + * Returns a pointer to the IP header on success and NULL on failure. + * + */ +static int calipso_skbuff_setattr(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct ipv6hdr *ip6_hdr; + struct ipv6_opt_hdr *hop; + unsigned char buf[CALIPSO_MAX_BUFFER]; + int len_delta, new_end, pad; + unsigned int start, end; + + ip6_hdr = ipv6_hdr(skb); + if (ip6_hdr->nexthdr == NEXTHDR_HOP) { + hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); + ret_val = calipso_opt_find(hop, &start, &end); + if (ret_val && ret_val != -ENOENT) + return ret_val; + } else { + start = 0; + end = 0; + } + + memset(buf, 0, sizeof(buf)); + ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr); + if (ret_val < 0) + return ret_val; + + new_end = start + ret_val; + /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */ + pad = ((new_end & 4) + (end & 7)) & 7; + len_delta = new_end - (int)end + pad; + ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + if (ret_val < 0) + return ret_val; + + if (len_delta) { + if (len_delta > 0) + skb_push(skb, len_delta); + else + skb_pull(skb, -len_delta); + memmove((char *)ip6_hdr - len_delta, ip6_hdr, + sizeof(*ip6_hdr) + start); + skb_reset_network_header(skb); + ip6_hdr = ipv6_hdr(skb); + } + + hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); + if (start == 0) { + struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf; + + new_hop->nexthdr = ip6_hdr->nexthdr; + new_hop->hdrlen = len_delta / 8 - 1; + ip6_hdr->nexthdr = NEXTHDR_HOP; + } else { + hop->hdrlen += len_delta / 8; + } + memcpy((char *)hop + start, buf + (start & 3), new_end - start); + calipso_pad_write((unsigned char *)hop, new_end, pad); + + return 0; +} + +/** + * calipso_skbuff_delattr - Delete any CALIPSO options from a packet + * @skb: the packet + * + * Description: + * Removes any and all CALIPSO options from the given packet. Returns zero on + * success, negative values on failure. + * + */ +static int calipso_skbuff_delattr(struct sk_buff *skb) +{ + int ret_val; + struct ipv6hdr *ip6_hdr; + struct ipv6_opt_hdr *old_hop; + u32 old_hop_len, start = 0, end = 0, delta, size, pad; + + if (!calipso_skbuff_optptr(skb)) + return 0; + + /* since we are changing the packet we should make a copy */ + ret_val = skb_cow(skb, skb_headroom(skb)); + if (ret_val < 0) + return ret_val; + + ip6_hdr = ipv6_hdr(skb); + old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); + old_hop_len = ipv6_optlen(old_hop); + + ret_val = calipso_opt_find(old_hop, &start, &end); + if (ret_val) + return ret_val; + + if (start == sizeof(*old_hop) && end == old_hop_len) { + /* There's no other option in the header so we delete + * the whole thing. */ + delta = old_hop_len; + size = sizeof(*ip6_hdr); + ip6_hdr->nexthdr = old_hop->nexthdr; + } else { + delta = (end - start) & ~7; + if (delta) + old_hop->hdrlen -= delta / 8; + pad = (end - start) & 7; + size = sizeof(*ip6_hdr) + start + pad; + calipso_pad_write((unsigned char *)old_hop, start, pad); + } + + if (delta) { + skb_pull(skb, delta); + memmove((char *)ip6_hdr + delta, ip6_hdr, size); + skb_reset_network_header(skb); + } + + return 0; +} + +static const struct netlbl_calipso_ops ops = { + .doi_add = calipso_doi_add, + .doi_free = calipso_doi_free, + .doi_remove = calipso_doi_remove, + .doi_getdef = calipso_doi_getdef, + .doi_putdef = calipso_doi_putdef, + .doi_walk = calipso_doi_walk, + .sock_getattr = calipso_sock_getattr, + .sock_setattr = calipso_sock_setattr, + .sock_delattr = calipso_sock_delattr, + .req_setattr = calipso_req_setattr, + .req_delattr = calipso_req_delattr, + .opt_getattr = calipso_opt_getattr, + .skbuff_optptr = calipso_skbuff_optptr, + .skbuff_setattr = calipso_skbuff_setattr, + .skbuff_delattr = calipso_skbuff_delattr, + .cache_invalidate = calipso_cache_invalidate, + .cache_add = calipso_cache_add +}; + +/** + * calipso_init - Initialize the CALIPSO module + * + * Description: + * Initialize the CALIPSO module and prepare it for use. Returns zero on + * success and negative values on failure. + * + */ +int __init calipso_init(void) +{ + int ret_val; + + ret_val = calipso_cache_init(); + if (!ret_val) + netlbl_calipso_ops_register(&ops); + return ret_val; +} + +void calipso_exit(void) +{ + netlbl_calipso_ops_register(NULL); + calipso_cache_invalidate(); + kfree(calipso_cache); +} diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 8de5dd7aa..139ceb68b 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -43,6 +43,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6_MIP6) #include #endif @@ -603,6 +604,28 @@ drop: return false; } +/* CALIPSO RFC 5570 */ + +static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff) +{ + const unsigned char *nh = skb_network_header(skb); + + if (nh[optoff + 1] < 8) + goto drop; + + if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1]) + goto drop; + + if (!calipso_validate(skb, nh + optoff)) + goto drop; + + return true; + +drop: + kfree_skb(skb); + return false; +} + static const struct tlvtype_proc tlvprochopopt_lst[] = { { .type = IPV6_TLV_ROUTERALERT, @@ -612,6 +635,10 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = { .type = IPV6_TLV_JUMBO, .func = ipv6_hop_jumbo, }, + { + .type = IPV6_TLV_CALIPSO, + .func = ipv6_hop_calipso, + }, { -1, } }; @@ -758,6 +785,27 @@ static int ipv6_renew_option(void *ohdr, return 0; } +/** + * ipv6_renew_options - replace a specific ext hdr with a new one. + * + * @sk: sock from which to allocate memory + * @opt: original options + * @newtype: option type to replace in @opt + * @newopt: new option of type @newtype to replace (user-mem) + * @newoptlen: length of @newopt + * + * Returns a new set of options which is a copy of @opt with the + * option type @newtype replaced with @newopt. + * + * @opt may be NULL, in which case a new set of options is returned + * containing just @newopt. + * + * @newopt may be NULL, in which case the specified option type is + * not copied into the new set of options. + * + * The new set of options is allocated from the socket option memory + * buffer of @sk. + */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, @@ -830,6 +878,34 @@ out: return ERR_PTR(err); } +/** + * ipv6_renew_options_kern - replace a specific ext hdr with a new one. + * + * @sk: sock from which to allocate memory + * @opt: original options + * @newtype: option type to replace in @opt + * @newopt: new option of type @newtype to replace (kernel-mem) + * @newoptlen: length of @newopt + * + * See ipv6_renew_options(). The difference is that @newopt is + * kernel memory, rather than user memory. + */ +struct ipv6_txoptions * +ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt, + int newtype, struct ipv6_opt_hdr *newopt, + int newoptlen) +{ + struct ipv6_txoptions *ret_val; + const mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret_val = ipv6_renew_options(sk, opt, newtype, + (struct ipv6_opt_hdr __user *)newopt, + newoptlen); + set_fs(old_fs); + return ret_val; +} + struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 9508a20fb..305e2ed73 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -112,7 +112,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, } EXPORT_SYMBOL(ipv6_skip_exthdr); -int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) +int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index ed33abf57..5857c1fc8 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -67,6 +67,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, struct net *net = rule->fr_net; pol_lookup_t lookup = arg->lookup_ptr; int err = 0; + u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: @@ -86,7 +87,8 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto discard_pkt; } - table = fib6_get_table(net, rule->table); + tb_id = fib_rule_get_table(rule, arg); + table = fib6_get_table(net, tb_id); if (!table) { err = -EAGAIN; goto out; @@ -199,7 +201,7 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct net *net = sock_net(skb->sk); struct fib6_rule *rule6 = (struct fib6_rule *) rule; - if (rule->action == FR_ACT_TO_TBL) { + if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) goto errout; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a4fa84076..bd59c343d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -388,7 +388,8 @@ relookup_failed: /* * Send an ICMP message in response to a packet in error */ -static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct in6_addr *force_saddr) { struct net *net = dev_net(skb->dev); struct inet6_dev *idev = NULL; @@ -475,6 +476,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = hdr->saddr; + if (force_saddr) + saddr = force_saddr; if (saddr) fl6.saddr = *saddr; fl6.flowi6_mark = mark; @@ -502,12 +505,14 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; + ipc6.tclass = np->tclass; + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.tclass = np->tclass; ipc6.dontfrag = np->dontfrag; ipc6.opt = NULL; @@ -549,10 +554,75 @@ out: */ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { - icmp6_send(skb, ICMPV6_PARAMPROB, code, pos); + icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL); kfree_skb(skb); } +/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH + * if sufficient data bytes are available + * @nhs is the size of the tunnel header(s) : + * Either an IPv4 header for SIT encap + * an IPv4 header + GRE header for GRE encap + */ +int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, + unsigned int data_len) +{ + struct in6_addr temp_saddr; + struct rt6_info *rt; + struct sk_buff *skb2; + u32 info = 0; + + if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8)) + return 1; + + /* RFC 4884 (partial) support for ICMP extensions */ + if (data_len < 128 || (data_len & 7) || skb->len < data_len) + data_len = 0; + + skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC); + + if (!skb2) + return 1; + + skb_dst_drop(skb2); + skb_pull(skb2, nhs); + skb_reset_network_header(skb2); + + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); + + if (rt && rt->dst.dev) + skb2->dev = rt->dst.dev; + + ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr); + + if (data_len) { + /* RFC 4884 (partial) support : + * insert 0 padding at the end, before the extensions + */ + __skb_push(skb2, nhs); + skb_reset_network_header(skb2); + memmove(skb2->data, skb2->data + nhs, data_len - nhs); + memset(skb2->data + data_len - nhs, 0, nhs); + /* RFC 4884 4.5 : Length is measured in 64-bit words, + * and stored in reserved[0] + */ + info = (data_len/8) << 24; + } + if (type == ICMP_TIME_EXCEEDED) + icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + info, &temp_saddr); + else + icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, + info, &temp_saddr); + if (rt) + ip6_rt_put(rt); + + kfree_skb(skb2); + + return 0; +} +EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); + static void icmpv6_echo_reply(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); @@ -585,7 +655,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; - fl6.flowi6_oif = l3mdev_fib_oif(skb->dev); + fl6.flowi6_oif = skb->dev->ifindex; fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index d08fd2d48..e0170f62b 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -109,7 +109,8 @@ static inline bool ila_csum_neutral_set(struct ila_identifier ident) return !!(ident.csum_neutral); } -void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p); +void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, + bool set_csum_neutral); void ila_init_saved_csum(struct ila_params *p); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 0e94042d1..ec9efbcda 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -34,12 +34,12 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr, if (p->locator_match.v64) { diff = p->csum_diff; } else { - diff = compute_csum_diff8((__be32 *)iaddr, - (__be32 *)&p->locator); + diff = compute_csum_diff8((__be32 *)&p->locator, + (__be32 *)iaddr); } fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? - ~CSUM_NEUTRAL_FLAG : CSUM_NEUTRAL_FLAG); + CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG); diff = csum_add(diff, fval); @@ -103,7 +103,8 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, iaddr->loc = p->locator; } -void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, + bool set_csum_neutral) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); @@ -114,7 +115,8 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) * is a locator being translated to a SIR address. * Perform (receiver) checksum-neutral translation. */ - ila_csum_do_neutral(iaddr, p); + if (!set_csum_neutral) + ila_csum_do_neutral(iaddr, p); } else { switch (p->csum_mode) { case ILA_CSUM_ADJUST_TRANSPORT: @@ -138,8 +140,8 @@ void ila_init_saved_csum(struct ila_params *p) return; p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator_match, - (__be32 *)&p->locator); + (__be32 *)&p->locator, + (__be32 *)&p->locator_match); } static int __init ila_init(void) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 1dfb64166..c8314c6b6 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -26,7 +26,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), true); return dst->lwtstate->orig_output(net, sk, skb); @@ -42,7 +42,7 @@ static int ila_input(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false); return dst->lwtstate->orig_input(skb); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index a90e57229..e6eca5fdf 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -210,14 +210,14 @@ static void ila_free_cb(void *ptr, void *arg) } } -static int ila_xlat_addr(struct sk_buff *skb); +static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral); static unsigned int ila_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - ila_xlat_addr(skb); + ila_xlat_addr(skb, false); return NF_ACCEPT; } @@ -597,7 +597,7 @@ static struct pernet_operations ila_net_ops = { .size = sizeof(struct ila_net), }; -static int ila_xlat_addr(struct sk_buff *skb) +static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -616,7 +616,7 @@ static int ila_xlat_addr(struct sk_buff *skb) ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) - ila_update_ipv6_locator(skb, &ila->xp.ip); + ila_update_ipv6_locator(skb, &ila->xp.ip, set_csum_neutral); rcu_read_unlock(); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 776d14511..edc3daab3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -519,8 +519,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); - skb_set_inner_protocol(skb, protocol); - return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); } @@ -650,7 +648,6 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = skb->protocol; err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 14dacc544..713676f14 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -39,7 +39,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) if (!send) goto out; - send(skb, type, code, info); + send(skb, type, code, info, NULL); out: rcu_read_unlock(); } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 94611e450..aacfb4bce 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -323,6 +323,7 @@ int ip6_input(struct sk_buff *skb) dev_net(skb->dev), NULL, skb, skb->dev, NULL, ip6_input_finish); } +EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 635b8d340..1dfc402d9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -368,7 +368,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; @@ -910,6 +910,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, int err; int flags = 0; + if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif && + (!*dst || !(*dst)->error)) { + err = l3mdev_get_saddr6(net, sk, fl6); + if (err) + goto out_err; + } + /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the @@ -999,10 +1006,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, return 0; out_err_release: - if (err == -ENETUNREACH) - IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); dst_release(*dst); *dst = NULL; +out_err: + if (err == -ENETUNREACH) + IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index d90a11f14..5bd3afdcc 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb) goto discard; } - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - rcu_read_unlock(); - return xfrm6_rcv(skb); + return xfrm6_rcv_tnl(skb, t); } rcu_read_unlock(); return -EINVAL; @@ -340,6 +338,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; u32 orig_mark = skb->mark; int ret; @@ -357,7 +356,19 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(t->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 487ef3bc7..7f4265b16 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -921,6 +921,7 @@ static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *ca cache->mfc_un.res.maxvif = vifi + 1; } } + cache->mfc_un.res.lastuse = jiffies; } static int mif6_add(struct net *net, struct mr6_table *mrt, @@ -1592,14 +1593,15 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) if (likely(mrt->mroute6_sk == NULL)) { mrt->mroute6_sk = sk; net->ipv6.devconf_all->mc_forwarding++; - inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, - NETCONFA_IFINDEX_ALL, - net->ipv6.devconf_all); - } - else + } else { err = -EADDRINUSE; + } write_unlock_bh(&mrt_lock); + if (!err) + inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); rtnl_unlock(); return err; @@ -1617,11 +1619,11 @@ int ip6mr_sk_done(struct sock *sk) write_lock_bh(&mrt_lock); mrt->mroute6_sk = NULL; net->ipv6.devconf_all->mc_forwarding--; + write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); - write_unlock_bh(&mrt_lock); mroute_clean_tables(mrt, false); err = 0; @@ -2091,6 +2093,7 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, vif = cache->mf6c_parent; cache->mfc_un.res.pkt++; cache->mfc_un.res.bytes += skb->len; + cache->mfc_un.res.lastuse = jiffies; if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; @@ -2233,10 +2236,11 @@ int ip6_mr_input(struct sk_buff *skb) static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm) { - int ct; - struct rtnexthop *nhp; - struct nlattr *mp_attr; struct rta_mfc_stats mfcs; + struct nlattr *mp_attr; + struct rtnexthop *nhp; + unsigned long lastuse; + int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ if (c->mf6c_parent >= MAXMIFS) @@ -2266,18 +2270,23 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0) + if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), + RTA_PAD)) return -EMSGSIZE; rtm->rtm_type = RTN_MULTICAST; return 1; } -int ip6mr_get_route(struct net *net, - struct sk_buff *skb, struct rtmsg *rtm, int nowait) +int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, + int nowait, u32 portid) { int err; struct mr6_table *mrt; @@ -2322,6 +2331,7 @@ int ip6mr_get_route(struct net *net, return -ENOMEM; } + NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a9895e15e..5330262ab 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -98,7 +98,6 @@ int ip6_ra_control(struct sock *sk, int sel) return 0; } -static struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index c245895a3..fe65cdc28 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -73,15 +73,6 @@ #include #include -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(val, level, fmt, ...) \ -do { \ - if (val <= ND_DEBUG) \ - net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ -} while (0) - static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); @@ -150,11 +141,10 @@ struct neigh_table nd_tbl = { }; EXPORT_SYMBOL_GPL(nd_tbl); -static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) +void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, + int data_len, int pad) { - int pad = ndisc_addr_option_pad(skb->dev->type); - int data_len = skb->dev->addr_len; - int space = ndisc_opt_addr_space(skb->dev); + int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); opt[0] = type; @@ -171,6 +161,23 @@ static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) if (space > 0) memset(opt, 0, space); } +EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); + +static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, + void *data, u8 icmp6_type) +{ + __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, + ndisc_addr_option_pad(skb->dev->type)); + ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type); +} + +static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, + void *ha, + const u8 *ops_data) +{ + ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); + ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); +} static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end) @@ -185,24 +192,28 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, return cur <= end && cur->nd_opt_type == type ? cur : NULL; } -static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) +static inline int ndisc_is_useropt(const struct net_device *dev, + struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_RDNSS || - opt->nd_opt_type == ND_OPT_DNSSL; + opt->nd_opt_type == ND_OPT_DNSSL || + ndisc_ops_is_useropt(dev, opt->nd_opt_type); } -static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, +static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, + struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); - } while (cur < end && !ndisc_is_useropt(cur)); - return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; + } while (cur < end && !ndisc_is_useropt(dev, cur)); + return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL; } -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, +struct ndisc_options *ndisc_parse_options(const struct net_device *dev, + u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; @@ -217,6 +228,8 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; + if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) + goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: @@ -243,7 +256,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, break; #endif default: - if (ndisc_is_useropt(nd_opt)) { + if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; @@ -260,6 +273,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, nd_opt->nd_opt_len); } } +next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } @@ -509,7 +523,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, if (!dev->addr_len) inc_opt = 0; if (inc_opt) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, + NDISC_NEIGHBOUR_ADVERTISEMENT); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -528,8 +543,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, - dev->dev_addr); - + dev->dev_addr, + NDISC_NEIGHBOUR_ADVERTISEMENT); ndisc_send_skb(skb, daddr, src_addr); } @@ -574,7 +589,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (ipv6_addr_any(saddr)) inc_opt = false; if (inc_opt) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, + NDISC_NEIGHBOUR_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -590,7 +606,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, - dev->dev_addr); + dev->dev_addr, + NDISC_NEIGHBOUR_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } @@ -626,7 +643,7 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, } #endif if (send_sllao) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -641,7 +658,8 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, if (send_sllao) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, - dev->dev_addr); + dev->dev_addr, + NDISC_ROUTER_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } @@ -702,6 +720,15 @@ static int pndisc_is_router(const void *pkey, return ret; } +void ndisc_update(const struct net_device *dev, struct neighbour *neigh, + const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, + struct ndisc_options *ndopts) +{ + neigh_update(neigh, lladdr, new, flags); + /* report ndisc ops about neighbour update */ + ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); +} + static void ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); @@ -738,7 +765,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) return; } - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND options\n"); return; } @@ -856,9 +883,10 @@ have_ifp: neigh = __neigh_lookup(&nd_tbl, saddr, dev, !inc || lladdr || !dev->addr_len); if (neigh) - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| - NEIGH_UPDATE_F_OVERRIDE); + NEIGH_UPDATE_F_OVERRIDE, + NDISC_NEIGHBOUR_SOLICITATION, &ndopts); if (neigh || !dev->header_ops) { ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); @@ -911,7 +939,7 @@ static void ndisc_recv_na(struct sk_buff *skb) idev->cnf.drop_unsolicited_na) return; - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; } @@ -967,12 +995,13 @@ static void ndisc_recv_na(struct sk_buff *skb) goto out; } - neigh_update(neigh, lladdr, + ndisc_update(dev, neigh, lladdr, msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0)); + (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0), + NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts); if ((old_flags & ~neigh->flags) & NTF_ROUTER) { /* @@ -1017,7 +1046,7 @@ static void ndisc_recv_rs(struct sk_buff *skb) goto out; /* Parse ND options */ - if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); goto out; } @@ -1031,10 +1060,11 @@ static void ndisc_recv_rs(struct sk_buff *skb) neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); if (neigh) { - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| - NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + NEIGH_UPDATE_F_OVERRIDE_ISROUTER, + NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); } out: @@ -1135,7 +1165,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - if (!ndisc_parse_options(opt, optlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) { ND_PRINTK(2, warn, "RA: invalid ND options\n"); return; } @@ -1329,11 +1359,12 @@ skip_linkparms: goto out; } } - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - NEIGH_UPDATE_F_ISROUTER); + NEIGH_UPDATE_F_ISROUTER, + NDISC_ROUTER_ADVERTISEMENT, &ndopts); } if (!ipv6_accept_ra(in6_dev)) { @@ -1421,7 +1452,8 @@ skip_routeinfo: struct nd_opt_hdr *p; for (p = ndopts.nd_useropts; p; - p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) { + p = ndisc_next_useropt(skb->dev, p, + ndopts.nd_useropts_end)) { ndisc_ra_useropt(skb, p); } } @@ -1459,7 +1491,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; } - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) + if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) return; if (!ndopts.nd_opts_rh) { @@ -1504,7 +1536,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) struct dst_entry *dst; struct flowi6 fl6; int rd_len; - u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, + ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; int oif = l3mdev_fib_oif(dev); bool ret; @@ -1563,7 +1596,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_redirect_opt_addr_space(dev, neigh, + ops_data_buf, + &ops_data); } else read_unlock_bh(&neigh->lock); @@ -1594,7 +1629,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) */ if (ha) - ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha); + ndisc_fill_redirect_addr_option(buff, ha, ops_data); /* * build redirect option and copy skb over to the new packet. diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 63e06c3dd..552fac2f3 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -73,22 +73,22 @@ ip6_packet_match(const struct sk_buff *skb, unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); -#define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg))) - - if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, - &ip6info->src), IP6T_INV_SRCIP) || - FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, - &ip6info->dst), IP6T_INV_DSTIP)) + if (NF_INVF(ip6info, IP6T_INV_SRCIP, + ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, + &ip6info->src)) || + NF_INVF(ip6info, IP6T_INV_DSTIP, + ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, + &ip6info->dst))) return false; ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_IN)) + if (NF_INVF(ip6info, IP6T_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) + if (NF_INVF(ip6info, IP6T_INV_VIA_OUT, ret != 0)) return false; /* ... might want to do something with class and flowlabel here ... */ @@ -402,23 +402,12 @@ ip6t_do_table(struct sk_buff *skb, else return verdict; } -static bool find_jump_target(const struct xt_table_info *t, - const struct ip6t_entry *target) -{ - struct ip6t_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -487,10 +476,11 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct ip6t_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -724,6 +714,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ip6t_replace *repl) { struct ip6t_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -736,6 +727,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -745,15 +739,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - return ret; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ip6t_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } + ret = -EINVAL; if (i != repl->num_entries) - return -EINVAL; + goto out_free; /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { @@ -761,13 +758,16 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; if (newinfo->underflow[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -787,6 +787,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index cb2b28883..2b1a9dcdb 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -83,10 +83,6 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, { if (state->hook == NF_INET_LOCAL_OUT) return ip6t_mangle_out(skb, state); - if (state->hook == NF_INET_POST_ROUTING) - return ip6t_do_table(skb, state, - state->net->ipv6.ip6table_mangle); - /* INPUT/FORWARD */ return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 71d995ff3..2535223ba 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, struct in6_addr saddr, daddr; u_int8_t hop_limit; u32 mark, flowlabel; + int err; /* malformed packet, drop it */ if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) @@ -46,13 +47,16 @@ static unsigned int nf_route_table_hook(void *priv, flowlabel = *((u32 *)ipv6_hdr(skb)); ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE && + if (ret != NF_DROP && ret != NF_STOLEN && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || - flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) - return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP; + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { + err = ip6_route_me_harder(state->net, skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } return ret; } diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 533cd5719..92bda9908 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -47,6 +47,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = { .eval = nft_reject_ipv6_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 408660477..0e983b694 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -55,7 +55,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct icmp6hdr user_icmph; int addr_type; struct in6_addr *daddr; - int iif = 0; + int oif = 0; struct flowi6 fl6; int err; struct dst_entry *dst; @@ -78,25 +78,30 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (u->sin6_family != AF_INET6) { return -EAFNOSUPPORT; } - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != u->sin6_scope_id) { - return -EINVAL; - } daddr = &(u->sin6_addr); - iif = u->sin6_scope_id; + if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) + oif = u->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; } - if (!iif) - iif = sk->sk_bound_dev_if; + if (!oif) + oif = sk->sk_bound_dev_if; + + if (!oif) + oif = np->sticky_pktinfo.ipi6_ifindex; + + if (!oif && ipv6_addr_is_multicast(daddr)) + oif = np->mcast_oif; + else if (!oif) + oif = np->ucast_oif; addr_type = ipv6_addr_type(daddr); - if (__ipv6_addr_needs_scope_id(addr_type) && !iif) - return -EINVAL; - if (addr_type & IPV6_ADDR_MAPPED) + if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || + (addr_type & IPV6_ADDR_MAPPED) || + (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if)) return -EINVAL; /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ @@ -106,15 +111,14 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.saddr = np->saddr; fl6.daddr = *daddr; + fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + ipc6.tclass = np->tclass; + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); if (IS_ERR(dst)) @@ -142,7 +146,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) pfh.family = AF_INET6; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.tclass = np->tclass; ipc6.dontfrag = np->dontfrag; ipc6.opt = NULL; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 896350df6..590dd1f77 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -878,6 +878,11 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (inet->hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; + + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -886,9 +891,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - if (ipc6.dontfrag < 0) ipc6.dontfrag = np->dontfrag; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 520b7884d..269218aac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1042,8 +1042,8 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return pcpu_rt; } -static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, - struct flowi6 *fl6, int flags) +struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; struct rt6_info *rt; @@ -1139,6 +1139,7 @@ redo_rt6_select: } } +EXPORT_SYMBOL_GPL(ip6_pol_route); static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) @@ -1985,9 +1986,18 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - if (cfg->fc_table) + if (cfg->fc_table) { grt = ip6_nh_lookup_table(net, cfg, gw_addr); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + if (!grt) grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); @@ -2200,7 +2210,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * first-hop router for the specified ICMP Destination Address. */ - if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); return; } @@ -2235,12 +2245,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * We have finally decided to accept it. */ - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - NEIGH_UPDATE_F_ISROUTER)) - ); + NEIGH_UPDATE_F_ISROUTER)), + NDISC_REDIRECT, &ndopts); nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); if (!nrt) @@ -2585,23 +2595,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, return rt; } -int ip6_route_get_saddr(struct net *net, - struct rt6_info *rt, - const struct in6_addr *daddr, - unsigned int prefs, - struct in6_addr *saddr) -{ - struct inet6_dev *idev = - rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; - int err = 0; - if (rt && rt->rt6i_prefsrc.plen) - *saddr = rt->rt6i_prefsrc.addr; - else - err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - daddr, prefs, saddr); - return err; -} - /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { struct net_device *dev; @@ -3209,7 +3202,9 @@ static int rt6_fill_node(struct net *net, if (iif) { #ifdef CONFIG_IPV6_MROUTE if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { - int err = ip6mr_get_route(net, skb, rtm, nowait); + int err = ip6mr_get_route(net, skb, rtm, nowait, + portid); + if (err <= 0) { if (!nowait) { if (err == 0) @@ -3306,6 +3301,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) err = -EINVAL; memset(&fl6, 0, sizeof(fl6)); + rtm = nlmsg_data(nlh); + fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); if (tb[RTA_SRC]) { if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0619ac708..182b6a9be 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -479,47 +479,12 @@ static void ipip6_tunnel_uninit(struct net_device *dev) dev_put(dev); } -/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH - * if sufficient data bytes are available - */ -static int ipip6_err_gen_icmpv6_unreach(struct sk_buff *skb) -{ - int ihl = ((const struct iphdr *)skb->data)->ihl*4; - struct rt6_info *rt; - struct sk_buff *skb2; - - if (!pskb_may_pull(skb, ihl + sizeof(struct ipv6hdr) + 8)) - return 1; - - skb2 = skb_clone(skb, GFP_ATOMIC); - - if (!skb2) - return 1; - - skb_dst_drop(skb2); - skb_pull(skb2, ihl); - skb_reset_network_header(skb2); - - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); - - if (rt && rt->dst.dev) - skb2->dev = rt->dst.dev; - - icmpv6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); - - if (rt) - ip6_rt_put(rt); - - kfree_skb(skb2); - - return 0; -} - static int ipip6_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + unsigned int data_len = 0; struct ip_tunnel *t; int err; @@ -544,6 +509,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: break; @@ -571,11 +537,11 @@ static int ipip6_err(struct sk_buff *skb, u32 info) goto out; } - if (t->parms.iph.daddr == 0) + err = 0; + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len)) goto out; - err = 0; - if (!ipip6_err_gen_icmpv6_unreach(skb)) + if (t->parms.iph.daddr == 0) goto out; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) @@ -722,12 +688,19 @@ out: return 0; } -static const struct tnl_ptk_info tpi = { +static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; -static int ipip_rcv(struct sk_buff *skb) +#if IS_ENABLED(CONFIG_MPLS) +static const struct tnl_ptk_info mplsip_tpi = { + /* no tunnel info required for mplsip. */ + .proto = htons(ETH_P_MPLS_UC), +}; +#endif + +static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { const struct iphdr *iph; struct ip_tunnel *tunnel; @@ -736,15 +709,23 @@ static int ipip_rcv(struct sk_buff *skb) tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel) { - if (tunnel->parms.iph.protocol != IPPROTO_IPIP && + const struct tnl_ptk_info *tpi; + + if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto, false)) +#if IS_ENABLED(CONFIG_MPLS) + if (ipproto == IPPROTO_MPLS) + tpi = &mplsip_tpi; + else +#endif + tpi = &ipip_tpi; + if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return 1; @@ -754,6 +735,18 @@ drop: return 0; } +static int ipip_rcv(struct sk_buff *skb) +{ + return sit_tunnel_rcv(skb, IPPROTO_IPIP); +} + +#if IS_ENABLED(CONFIG_MPLS) +static int mplsip_rcv(struct sk_buff *skb) +{ + return sit_tunnel_rcv(skb, IPPROTO_MPLS); +} +#endif + /* * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function * stores the embedded IPv4 address in v4dst and returns true. @@ -825,9 +818,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); - if (skb->protocol != htons(ETH_P_IPV6)) - goto tx_error; - if (tos == 1) tos = ipv6_get_dsfield(iph6); @@ -995,7 +985,8 @@ tx_error: return NETDEV_TX_OK; } -static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb, + struct net_device *dev, u8 ipproto) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; @@ -1003,9 +994,9 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; - skb_set_inner_ipproto(skb, IPPROTO_IPIP); + skb_set_inner_ipproto(skb, ipproto); - ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); + ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); @@ -1018,11 +1009,16 @@ static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, { switch (skb->protocol) { case htons(ETH_P_IP): - ipip_tunnel_xmit(skb, dev); + sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); break; case htons(ETH_P_IPV6): ipip6_tunnel_xmit(skb, dev); break; +#if IS_ENABLED(CONFIG_MPLS) + case htons(ETH_P_MPLS_UC): + sit_tunnel_xmit__(skb, dev, IPPROTO_MPLS); + break; +#endif default: goto tx_err; } @@ -1130,6 +1126,16 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, } #endif +bool ipip6_valid_ip_proto(u8 ipproto) +{ + return ipproto == IPPROTO_IPV6 || + ipproto == IPPROTO_IPIP || +#if IS_ENABLED(CONFIG_MPLS) + ipproto == IPPROTO_MPLS || +#endif + ipproto == 0; +} + static int ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -1189,9 +1195,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) goto done; err = -EINVAL; - if (p.iph.protocol != IPPROTO_IPV6 && - p.iph.protocol != IPPROTO_IPIP && - p.iph.protocol != 0) + if (!ipip6_valid_ip_proto(p.iph.protocol)) goto done; if (p.iph.version != 4 || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) @@ -1416,9 +1420,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); - if (proto != IPPROTO_IPV6 && - proto != IPPROTO_IPIP && - proto != 0) + if (!ipip6_valid_ip_proto(proto)) return -EINVAL; return 0; @@ -1760,6 +1762,14 @@ static struct xfrm_tunnel ipip_handler __read_mostly = { .priority = 2, }; +#if IS_ENABLED(CONFIG_MPLS) +static struct xfrm_tunnel mplsip_handler __read_mostly = { + .handler = mplsip_rcv, + .err_handler = ipip6_err, + .priority = 2, +}; +#endif + static void __net_exit sit_destroy_tunnels(struct net *net, struct list_head *head) { @@ -1855,6 +1865,9 @@ static void __exit sit_cleanup(void) rtnl_link_unregister(&sit_link_ops); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +#if IS_ENABLED(CONFIG_MPLS) + xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); +#endif unregister_pernet_device(&sit_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ @@ -1864,7 +1877,7 @@ static int __init sit_init(void) { int err; - pr_info("IPv6 over IPv4 tunneling driver\n"); + pr_info("IPv6, IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&sit_net_ops); if (err < 0) @@ -1879,6 +1892,13 @@ static int __init sit_init(void) pr_info("%s: can't register ip4ip4\n", __func__); goto xfrm_tunnel4_failed; } +#if IS_ENABLED(CONFIG_MPLS) + err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); + if (err < 0) { + pr_info("%s: can't register mplsip\n", __func__); + goto xfrm_tunnel_mpls_failed; + } +#endif err = rtnl_link_register(&sit_link_ops); if (err < 0) goto rtnl_link_failed; @@ -1887,6 +1907,10 @@ out: return err; rtnl_link_failed: +#if IS_ENABLED(CONFIG_MPLS) + xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); +xfrm_tunnel_mpls_failed: +#endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel4_failed: xfrm4_tunnel_deregister(&sit_handler, AF_INET6); diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 45243bbe5..69c50e737 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -15,6 +15,9 @@ #include #include #include +#ifdef CONFIG_NETLABEL +#include +#endif static int one = 1; static int auto_flowlabels_min; @@ -106,6 +109,22 @@ static struct ctl_table ipv6_rotable[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one }, +#ifdef CONFIG_NETLABEL + { + .procname = "calipso_cache_enable", + .data = &calipso_cache_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "calipso_cache_bucket_size", + .data = &calipso_cache_bucketsize, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NETLABEL */ { } }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a2aaca9be..27a968b88 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -459,6 +459,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; @@ -479,8 +480,10 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); rcu_read_lock(); - err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), - np->tclass); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); + err = ip6_xmit(sk, skb, fl6, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -492,6 +495,7 @@ done: static void tcp_v6_reqsk_destructor(struct request_sock *req) { + kfree(inet_rsk(req)->ipv6_opt); kfree_skb(inet_rsk(req)->pktopts); } @@ -542,26 +546,33 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } -static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, - const struct in6_addr *daddr, - const struct in6_addr *saddr, int nbytes) +static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + const struct tcphdr *th, int nbytes) { struct tcp6_pseudohdr *bp; struct scatterlist sg; + struct tcphdr *_th; - bp = &hp->md5_blk.ip6; + bp = hp->scratch; /* 1. TCP pseudo-header (RFC2460) */ bp->saddr = *saddr; bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); - sg_init_one(&sg, bp, sizeof(*bp)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + _th = (struct tcphdr *)(bp + 1); + memcpy(_th, th, sizeof(*th)); + _th->check = 0; + + sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, + sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->md5_req); } -static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, +static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, const struct in6_addr *daddr, struct in6_addr *saddr, const struct tcphdr *th) { @@ -575,9 +586,7 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; @@ -622,9 +631,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) goto clear_hash; @@ -1131,7 +1138,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * but we make one more one thing there: reattach optmem to newsk. */ - opt = rcu_dereference(np->opt); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 42a2edf7c..19ac3a1c3 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1207,6 +1207,11 @@ do_udp_sendmsg: security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; + + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -1217,9 +1222,6 @@ do_udp_sendmsg: if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: @@ -1458,7 +1460,6 @@ struct proto udpv6_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 9cf097e20..fd6ef4148 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -50,7 +50,6 @@ struct proto udplitev6_prot = { .unhash = udp_lib_unhash, .get_port = udp_v6_get_port, .obj_size = sizeof(struct udp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 0eaab1fa6..b5789562a 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -21,8 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm6_extract_header(skb); } -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t) { + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); @@ -48,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return -1; } -int xfrm6_rcv(struct sk_buff *skb) +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], - 0); + 0, t); } -EXPORT_SYMBOL(xfrm6_rcv); +EXPORT_SYMBOL(xfrm6_rcv_tnl); +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_tnl(skb, NULL); +} +EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index c074771a1..70a86adad 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -36,7 +36,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, int err; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = oif; + fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) @@ -366,12 +366,12 @@ static void __net_exit xfrm6_net_sysctl_exit(struct net *net) kfree(table); } #else /* CONFIG_SYSCTL */ -static int inline xfrm6_net_sysctl_init(struct net *net) +static inline int xfrm6_net_sysctl_init(struct net *net) { return 0; } -static void inline xfrm6_net_sysctl_exit(struct net *net) +static inline void xfrm6_net_sysctl_exit(struct net *net) { } #endif diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5743044cd..e1c0bbe79 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 8d2f7c9b4..ccc244406 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -832,7 +832,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) struct sock *sk = sock->sk; struct irda_sock *new, *self = irda_sk(sk); struct sock *newsk; - struct sk_buff *skb; + struct sk_buff *skb = NULL; int err; err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); @@ -900,7 +900,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) err = -EPERM; /* value does not seem to make sense. -arnd */ if (!new->tsap) { pr_debug("%s(), dup failed!\n", __func__); - kfree_skb(skb); goto out; } @@ -919,7 +918,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) /* Clean up the original one to keep it in listen state */ irttp_listen(self->tsap); - kfree_skb(skb); sk->sk_ack_backlog--; newsock->state = SS_CONNECTED; @@ -927,6 +925,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) irda_connect_response(new); err = 0; out: + kfree_skb(skb); release_sock(sk); return err; } diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c index d4fdf8f7b..8f5678cb6 100644 --- a/net/irda/ircomm/ircomm_tty_ioctl.c +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -246,9 +246,6 @@ static int ircomm_tty_get_serial_info(struct ircomm_tty_cb *self, { struct serial_struct info; - if (!retinfo) - return -EFAULT; - memset(&info, 0, sizeof(info)); info.line = self->line; info.flags = self->port.flags; @@ -258,11 +255,6 @@ static int ircomm_tty_get_serial_info(struct ircomm_tty_cb *self, /* For compatibility */ info.type = PORT_16550A; - info.port = 0; - info.irq = 0; - info.xmit_fifo_size = 0; - info.hub6 = 0; - info.custom_divisor = 0; if (copy_to_user(retinfo, &info, sizeof(*retinfo))) return -EFAULT; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index fc3598a92..02b45a8e8 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -530,8 +531,10 @@ static void iucv_sock_close(struct sock *sk) static void iucv_sock_init(struct sock *sk, struct sock *parent) { - if (parent) + if (parent) { sk->sk_type = parent->sk_type; + security_sk_clone(parent, sk); + } } static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern) @@ -1033,6 +1036,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); + size_t headroom, linear; struct sk_buff *skb; struct iucv_message txmsg = {0}; struct cmsghdr *cmsg; @@ -1110,20 +1114,31 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, * this is fine for SOCK_SEQPACKET (unless we want to support * segmented records using the MSG_EOR flag), but * for SOCK_STREAM we might want to improve it in future */ - if (iucv->transport == AF_IUCV_TRANS_HIPER) - skb = sock_alloc_send_skb(sk, - len + sizeof(struct af_iucv_trans_hdr) + ETH_HLEN, - noblock, &err); - else - skb = sock_alloc_send_skb(sk, len, noblock, &err); + headroom = (iucv->transport == AF_IUCV_TRANS_HIPER) + ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0; + if (headroom + len < PAGE_SIZE) { + linear = len; + } else { + /* In nonlinear "classic" iucv skb, + * reserve space for iucv_array + */ + if (iucv->transport != AF_IUCV_TRANS_HIPER) + headroom += sizeof(struct iucv_array) * + (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } + skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear, + noblock, &err, 0); if (!skb) goto out; - if (iucv->transport == AF_IUCV_TRANS_HIPER) - skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN); - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - err = -EFAULT; + if (headroom) + skb_reserve(skb, headroom); + skb_put(skb, linear); + skb->len = len; + skb->data_len = len - linear; + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); + if (err) goto fail; - } /* wait if outstanding messages for iucv path has reached */ timeo = sock_sndtimeo(sk, noblock); @@ -1148,49 +1163,67 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, atomic_dec(&iucv->msg_sent); goto fail; } - goto release; - } - skb_queue_tail(&iucv->send_skb_q, skb); - - if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) - && skb->len <= 7) { - err = iucv_send_iprm(iucv->path, &txmsg, skb); + } else { /* Classic VM IUCV transport */ + skb_queue_tail(&iucv->send_skb_q, skb); + + if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) && + skb->len <= 7) { + err = iucv_send_iprm(iucv->path, &txmsg, skb); + + /* on success: there is no message_complete callback */ + /* for an IPRMDATA msg; remove skb from send queue */ + if (err == 0) { + skb_unlink(skb, &iucv->send_skb_q); + kfree_skb(skb); + } - /* on success: there is no message_complete callback - * for an IPRMDATA msg; remove skb from send queue */ - if (err == 0) { - skb_unlink(skb, &iucv->send_skb_q); - kfree_skb(skb); + /* this error should never happen since the */ + /* IUCV_IPRMDATA path flag is set... sever path */ + if (err == 0x15) { + pr_iucv->path_sever(iucv->path, NULL); + skb_unlink(skb, &iucv->send_skb_q); + err = -EPIPE; + goto fail; + } + } else if (skb_is_nonlinear(skb)) { + struct iucv_array *iba = (struct iucv_array *)skb->head; + int i; + + /* skip iucv_array lying in the headroom */ + iba[0].address = (u32)(addr_t)skb->data; + iba[0].length = (u32)skb_headlen(skb); + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + iba[i + 1].address = + (u32)(addr_t)skb_frag_address(frag); + iba[i + 1].length = (u32)skb_frag_size(frag); + } + err = pr_iucv->message_send(iucv->path, &txmsg, + IUCV_IPBUFLST, 0, + (void *)iba, skb->len); + } else { /* non-IPRM Linear skb */ + err = pr_iucv->message_send(iucv->path, &txmsg, + 0, 0, (void *)skb->data, skb->len); } - - /* this error should never happen since the - * IUCV_IPRMDATA path flag is set... sever path */ - if (err == 0x15) { - pr_iucv->path_sever(iucv->path, NULL); + if (err) { + if (err == 3) { + user_id[8] = 0; + memcpy(user_id, iucv->dst_user_id, 8); + appl_id[8] = 0; + memcpy(appl_id, iucv->dst_name, 8); + pr_err( + "Application %s on z/VM guest %s exceeds message limit\n", + appl_id, user_id); + err = -EAGAIN; + } else { + err = -EPIPE; + } skb_unlink(skb, &iucv->send_skb_q); - err = -EPIPE; goto fail; } - } else - err = pr_iucv->message_send(iucv->path, &txmsg, 0, 0, - (void *) skb->data, skb->len); - if (err) { - if (err == 3) { - user_id[8] = 0; - memcpy(user_id, iucv->dst_user_id, 8); - appl_id[8] = 0; - memcpy(appl_id, iucv->dst_name, 8); - pr_err("Application %s on z/VM guest %s" - " exceeds message limit\n", - appl_id, user_id); - err = -EAGAIN; - } else - err = -EPIPE; - skb_unlink(skb, &iucv->send_skb_q); - goto fail; } -release: release_sock(sk); return len; @@ -1201,42 +1234,32 @@ out: return err; } -/* iucv_fragment_skb() - Fragment a single IUCV message into multiple skb's - * - * Locking: must be called with message_q.lock held - */ -static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len) +static struct sk_buff *alloc_iucv_recv_skb(unsigned long len) { - int dataleft, size, copied = 0; - struct sk_buff *nskb; - - dataleft = len; - while (dataleft) { - if (dataleft >= sk->sk_rcvbuf / 4) - size = sk->sk_rcvbuf / 4; - else - size = dataleft; - - nskb = alloc_skb(size, GFP_ATOMIC | GFP_DMA); - if (!nskb) - return -ENOMEM; - - /* copy target class to control buffer of new skb */ - IUCV_SKB_CB(nskb)->class = IUCV_SKB_CB(skb)->class; - - /* copy data fragment */ - memcpy(nskb->data, skb->data + copied, size); - copied += size; - dataleft -= size; - - skb_reset_transport_header(nskb); - skb_reset_network_header(nskb); - nskb->len = size; + size_t headroom, linear; + struct sk_buff *skb; + int err; - skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, nskb); + if (len < PAGE_SIZE) { + headroom = 0; + linear = len; + } else { + headroom = sizeof(struct iucv_array) * (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } + skb = alloc_skb_with_frags(headroom + linear, len - linear, + 0, &err, GFP_ATOMIC | GFP_DMA); + WARN_ONCE(!skb, + "alloc of recv iucv skb len=%lu failed with errcode=%d\n", + len, err); + if (skb) { + if (headroom) + skb_reserve(skb, headroom); + skb_put(skb, linear); + skb->len = len; + skb->data_len = len - linear; } - - return 0; + return skb; } /* iucv_process_message() - Receive a single outstanding IUCV message @@ -1263,31 +1286,32 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, skb->len = 0; } } else { - rc = pr_iucv->message_receive(path, msg, + if (skb_is_nonlinear(skb)) { + struct iucv_array *iba = (struct iucv_array *)skb->head; + int i; + + iba[0].address = (u32)(addr_t)skb->data; + iba[0].length = (u32)skb_headlen(skb); + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + iba[i + 1].address = + (u32)(addr_t)skb_frag_address(frag); + iba[i + 1].length = (u32)skb_frag_size(frag); + } + rc = pr_iucv->message_receive(path, msg, + IUCV_IPBUFLST, + (void *)iba, len, NULL); + } else { + rc = pr_iucv->message_receive(path, msg, msg->flags & IUCV_IPRMDATA, skb->data, len, NULL); + } if (rc) { kfree_skb(skb); return; } - /* we need to fragment iucv messages for SOCK_STREAM only; - * for SOCK_SEQPACKET, it is only relevant if we support - * record segmentation using MSG_EOR (see also recvmsg()) */ - if (sk->sk_type == SOCK_STREAM && - skb->truesize >= sk->sk_rcvbuf / 4) { - rc = iucv_fragment_skb(sk, skb, len); - kfree_skb(skb); - skb = NULL; - if (rc) { - pr_iucv->path_sever(path, NULL); - return; - } - skb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q); - } else { - skb_reset_transport_header(skb); - skb_reset_network_header(skb); - skb->len = len; - } + WARN_ON_ONCE(skb->len != len); } IUCV_SKB_CB(skb)->offset = 0; @@ -1306,7 +1330,7 @@ static void iucv_process_message_q(struct sock *sk) struct sock_msg_q *p, *n; list_for_each_entry_safe(p, n, &iucv->message_q.list, list) { - skb = alloc_skb(iucv_msg_length(&p->msg), GFP_ATOMIC | GFP_DMA); + skb = alloc_iucv_recv_skb(iucv_msg_length(&p->msg)); if (!skb) break; iucv_process_message(sk, skb, p->path, &p->msg); @@ -1801,7 +1825,7 @@ static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg) if (len > sk->sk_rcvbuf) goto save_message; - skb = alloc_skb(iucv_msg_length(msg), GFP_ATOMIC | GFP_DMA); + skb = alloc_iucv_recv_skb(iucv_msg_length(msg)); if (!skb) goto save_message; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 7eaa000c9..88a2a3ba4 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -320,21 +320,29 @@ static union iucv_param *iucv_param_irq[NR_CPUS]; * * Returns the result of the CP IUCV call. */ -static inline int iucv_call_b2f0(int command, union iucv_param *parm) +static inline int __iucv_call_b2f0(int command, union iucv_param *parm) { register unsigned long reg0 asm ("0"); register unsigned long reg1 asm ("1"); int ccode; reg0 = command; - reg1 = virt_to_phys(parm); + reg1 = (unsigned long)parm; asm volatile( " .long 0xb2f01000\n" " ipm %0\n" " srl %0,28\n" : "=d" (ccode), "=m" (*parm), "+d" (reg0), "+a" (reg1) : "m" (*parm) : "cc"); - return (ccode == 1) ? parm->ctrl.iprcode : ccode; + return ccode; +} + +static inline int iucv_call_b2f0(int command, union iucv_param *parm) +{ + int ccode; + + ccode = __iucv_call_b2f0(command, parm); + return ccode == 1 ? parm->ctrl.iprcode : ccode; } /** @@ -345,16 +353,12 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm) * Returns the maximum number of connections or -EPERM is IUCV is not * available. */ -static int iucv_query_maxconn(void) +static int __iucv_query_maxconn(void *param, unsigned long *max_pathid) { register unsigned long reg0 asm ("0"); register unsigned long reg1 asm ("1"); - void *param; int ccode; - param = kzalloc(sizeof(union iucv_param), GFP_KERNEL|GFP_DMA); - if (!param) - return -ENOMEM; reg0 = IUCV_QUERY; reg1 = (unsigned long) param; asm volatile ( @@ -362,8 +366,22 @@ static int iucv_query_maxconn(void) " ipm %0\n" " srl %0,28\n" : "=d" (ccode), "+d" (reg0), "+d" (reg1) : : "cc"); + *max_pathid = reg1; + return ccode; +} + +static int iucv_query_maxconn(void) +{ + unsigned long max_pathid; + void *param; + int ccode; + + param = kzalloc(sizeof(union iucv_param), GFP_KERNEL | GFP_DMA); + if (!param) + return -ENOMEM; + ccode = __iucv_query_maxconn(param, &max_pathid); if (ccode == 0) - iucv_max_pathid = reg1; + iucv_max_pathid = max_pathid; kfree(param); return ccode ? -EPERM : 0; } diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index fda7f4715..16c2e03bd 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -88,13 +88,9 @@ struct kcm_proc_mux_state { static int kcm_seq_open(struct inode *inode, struct file *file) { struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode); - int err; - err = seq_open_net(inode, file, &muxinfo->seq_ops, + return seq_open_net(inode, file, &muxinfo->seq_ops, sizeof(struct kcm_proc_mux_state)); - if (err < 0) - return err; - return err; } static void kcm_format_mux_header(struct seq_file *seq) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 48613f5dd..411693288 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1766,18 +1766,12 @@ static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) if (!csock) return -ENOENT; - prog = bpf_prog_get(info->bpf_fd); + prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out; } - if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(prog); - err = -EINVAL; - goto out; - } - err = kcm_attach(sock, csock, prog); if (err) { bpf_prog_put(prog); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 1e40dacaa..a2ed3bda4 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1855,6 +1855,9 @@ static __net_exit void l2tp_exit_net(struct net *net) (void)l2tp_tunnel_delete(tunnel); } rcu_read_unlock_bh(); + + flush_workqueue(l2tp_wq); + rcu_barrier(); } static struct pernet_operations l2tp_net_ops = { diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index e253c26f3..57fc5a46c 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -67,7 +67,6 @@ static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) return net_generic(net, l2tp_eth_net_id); } -static struct lock_class_key l2tp_eth_tx_busylock; static int l2tp_eth_dev_init(struct net_device *dev) { struct l2tp_eth *priv = netdev_priv(dev); @@ -75,7 +74,8 @@ static int l2tp_eth_dev_init(struct net_device *dev) priv->dev = dev; eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); - dev->qdisc_tx_busylock = &l2tp_eth_tx_busylock; + netdev_lockdep_set_classes(dev); + return 0; } diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 6c54e03fe..ea2ae6664 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -611,6 +611,11 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; + + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -620,9 +625,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - if (ipc6.dontfrag < 0) ipc6.dontfrag = np->dontfrag; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 652c250b9..232cb9203 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -856,7 +856,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, error = -ENOTCONN; if (sk == NULL) goto end; - if (sk->sk_state != PPPOX_CONNECTED) + if (!(sk->sk_state & PPPOX_CONNECTED)) goto end; error = -EBADF; @@ -866,10 +866,8 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, pls = l2tp_session_priv(session); tunnel = l2tp_sock_to_tunnel(pls->tunnel_sock); - if (tunnel == NULL) { - error = -EBADF; + if (tunnel == NULL) goto end_put_sess; - } inet = inet_sk(tunnel->sock); if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) { @@ -947,12 +945,11 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, } *usockaddr_len = len; + error = 0; sock_put(pls->tunnel_sock); end_put_sess: sock_put(sk); - error = 0; - end: return error; } diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 6651a78e1..c4a1c3e84 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -10,6 +10,7 @@ */ #include +#include #include /** @@ -107,7 +108,7 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); */ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, - const struct flowi6 *fl6) + struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; @@ -160,3 +161,64 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) return rc; } EXPORT_SYMBOL_GPL(l3mdev_get_saddr); + +int l3mdev_get_saddr6(struct net *net, const struct sock *sk, + struct flowi6 *fl6) +{ + struct net_device *dev; + int rc = 0; + + if (fl6->flowi6_oif) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); + if (dev && netif_is_l3_slave(dev)) + dev = netdev_master_upper_dev_get_rcu(dev); + + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_get_saddr6) + rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6); + + rcu_read_unlock(); + } + + return rc; +} +EXPORT_SYMBOL_GPL(l3mdev_get_saddr6); + +/** + * l3mdev_fib_rule_match - Determine if flowi references an + * L3 master device + * @net: network namespace for device index lookup + * @fl: flow struct + */ + +int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, + struct fib_lookup_arg *arg) +{ + struct net_device *dev; + int rc = 0; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, fl->flowi_oif); + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_fib_table) { + arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); + rc = 1; + goto out; + } + + dev = dev_get_by_index_rcu(net, fl->flowi_iif); + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_fib_table) { + arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); + rc = 1; + goto out; + } + +out: + rcu_read_unlock(); + + return rc; +} diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 0a7305a97..afa94687d 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -312,6 +312,24 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, mutex_lock(&sta->ampdu_mlme.mtx); if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { + tid_agg_rx = rcu_dereference_protected( + sta->ampdu_mlme.tid_rx[tid], + lockdep_is_held(&sta->ampdu_mlme.mtx)); + + if (tid_agg_rx->dialog_token == dialog_token) { + ht_dbg_ratelimited(sta->sdata, + "updated AddBA Req from %pM on tid %u\n", + sta->sta.addr, tid); + /* We have no API to update the timeout value in the + * driver so reject the timeout update. + */ + status = WLAN_STATUS_REQUEST_DECLINED; + ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, + tid, dialog_token, status, + 1, buf_size, timeout); + goto end; + } + ht_dbg_ratelimited(sta->sdata, "unexpected AddBA Req from %pM on tid %u\n", sta->sta.addr, tid); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 744ad1c0b..45319cc01 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -194,17 +194,21 @@ static void ieee80211_agg_stop_txq(struct sta_info *sta, int tid) { struct ieee80211_txq *txq = sta->sta.txq[tid]; + struct ieee80211_sub_if_data *sdata; + struct fq *fq; struct txq_info *txqi; if (!txq) return; txqi = to_txq_info(txq); + sdata = vif_to_sdata(txq->vif); + fq = &sdata->local->fq; /* Lock here to protect against further seqno updates on dequeue */ - spin_lock_bh(&txqi->queue.lock); + spin_lock_bh(&fq->lock); set_bit(IEEE80211_TXQ_STOP, &txqi->flags); - spin_unlock_bh(&txqi->queue.lock); + spin_unlock_bh(&fq->lock); } static void diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 8cc49c044..543b1d4fc 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -997,6 +997,7 @@ static void sta_apply_mesh_params(struct ieee80211_local *local, if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) changed = mesh_plink_inc_estab_count(sdata); sta->mesh->plink_state = params->plink_state; + sta->mesh->aid = params->peer_aid; ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index b251b2f7f..2906c1004 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -10,6 +10,7 @@ #include #include +#include #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" @@ -70,6 +71,177 @@ DEBUGFS_READONLY_FILE(wep_iv, "%#08x", DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s", local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver"); +struct aqm_info { + struct ieee80211_local *local; + size_t size; + size_t len; + unsigned char buf[0]; +}; + +#define AQM_HDR_LEN 200 +#define AQM_HW_ENTRY_LEN 40 +#define AQM_TXQ_ENTRY_LEN 110 + +static int aqm_open(struct inode *inode, struct file *file) +{ + struct ieee80211_local *local = inode->i_private; + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + struct txq_info *txqi; + struct fq *fq = &local->fq; + struct aqm_info *info = NULL; + int len = 0; + int i; + + if (!local->ops->wake_tx_queue) + return -EOPNOTSUPP; + + len += AQM_HDR_LEN; + len += 6 * AQM_HW_ENTRY_LEN; + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) + len += AQM_TXQ_ENTRY_LEN; + list_for_each_entry_rcu(sta, &local->sta_list, list) + len += AQM_TXQ_ENTRY_LEN * ARRAY_SIZE(sta->sta.txq); + rcu_read_unlock(); + + info = vmalloc(len); + if (!info) + return -ENOMEM; + + spin_lock_bh(&local->fq.lock); + rcu_read_lock(); + + file->private_data = info; + info->local = local; + info->size = len; + len = 0; + + len += scnprintf(info->buf + len, info->size - len, + "* hw\n" + "access name value\n" + "R fq_flows_cnt %u\n" + "R fq_backlog %u\n" + "R fq_overlimit %u\n" + "R fq_collisions %u\n" + "RW fq_limit %u\n" + "RW fq_quantum %u\n", + fq->flows_cnt, + fq->backlog, + fq->overlimit, + fq->collisions, + fq->limit, + fq->quantum); + + len += scnprintf(info->buf + len, + info->size - len, + "* vif\n" + "ifname addr ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n"); + + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + txqi = to_txq_info(sdata->vif.txq); + len += scnprintf(info->buf + len, info->size - len, + "%s %pM %u %u %u %u %u %u %u %u\n", + sdata->name, + sdata->vif.addr, + txqi->txq.ac, + txqi->tin.backlog_bytes, + txqi->tin.backlog_packets, + txqi->tin.flows, + txqi->tin.overlimit, + txqi->tin.collisions, + txqi->tin.tx_bytes, + txqi->tin.tx_packets); + } + + len += scnprintf(info->buf + len, + info->size - len, + "* sta\n" + "ifname addr tid ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n"); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + sdata = sta->sdata; + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + txqi = to_txq_info(sta->sta.txq[i]); + len += scnprintf(info->buf + len, info->size - len, + "%s %pM %d %d %u %u %u %u %u %u %u\n", + sdata->name, + sta->sta.addr, + txqi->txq.tid, + txqi->txq.ac, + txqi->tin.backlog_bytes, + txqi->tin.backlog_packets, + txqi->tin.flows, + txqi->tin.overlimit, + txqi->tin.collisions, + txqi->tin.tx_bytes, + txqi->tin.tx_packets); + } + } + + info->len = len; + + rcu_read_unlock(); + spin_unlock_bh(&local->fq.lock); + + return 0; +} + +static int aqm_release(struct inode *inode, struct file *file) +{ + vfree(file->private_data); + return 0; +} + +static ssize_t aqm_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct aqm_info *info = file->private_data; + + return simple_read_from_buffer(user_buf, count, ppos, + info->buf, info->len); +} + +static ssize_t aqm_write(struct file *file, + const char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct aqm_info *info = file->private_data; + struct ieee80211_local *local = info->local; + char buf[100]; + size_t len; + + if (count > sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, user_buf, count)) + return -EFAULT; + + buf[sizeof(buf) - 1] = '\0'; + len = strlen(buf); + if (len > 0 && buf[len-1] == '\n') + buf[len-1] = 0; + + if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) + return count; + else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1) + return count; + + return -EINVAL; +} + +static const struct file_operations aqm_ops = { + .write = aqm_write, + .read = aqm_read, + .open = aqm_open, + .release = aqm_release, + .llseek = default_llseek, +}; + #ifdef CONFIG_PM static ssize_t reset_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) @@ -256,6 +428,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); + DEBUGFS_ADD_MODE(aqm, 0600); statsd = debugfs_create_dir("statistics", phyd); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 33dfcbc2b..fd334133f 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -328,14 +328,88 @@ STA_OPS(ht_capa); static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - char buf[128], *p = buf; + char buf[512], *p = buf; struct sta_info *sta = file->private_data; struct ieee80211_sta_vht_cap *vhtc = &sta->sta.vht_cap; p += scnprintf(p, sizeof(buf) + buf - p, "VHT %ssupported\n", vhtc->vht_supported ? "" : "not "); if (vhtc->vht_supported) { - p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.8x\n", vhtc->cap); + p += scnprintf(p, sizeof(buf) + buf - p, "cap: %#.8x\n", + vhtc->cap); +#define PFLAG(a, b) \ + do { \ + if (vhtc->cap & IEEE80211_VHT_CAP_ ## a) \ + p += scnprintf(p, sizeof(buf) + buf - p, \ + "\t\t%s\n", b); \ + } while (0) + + switch (vhtc->cap & 0x3) { + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMAX-MPDU-3895\n"); + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMAX-MPDU-7991\n"); + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMAX-MPDU-11454\n"); + break; + default: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMAX-MPDU-UNKNOWN\n"); + }; + switch (vhtc->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { + case 0: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\t80Mhz\n"); + break; + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\t160Mhz\n"); + break; + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\t80+80Mhz\n"); + break; + default: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tUNKNOWN-MHZ: 0x%x\n", + (vhtc->cap >> 2) & 0x3); + }; + PFLAG(RXLDPC, "RXLDPC"); + PFLAG(SHORT_GI_80, "SHORT-GI-80"); + PFLAG(SHORT_GI_160, "SHORT-GI-160"); + PFLAG(TXSTBC, "TXSTBC"); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tRXSTBC_%d\n", (vhtc->cap >> 8) & 0x7); + PFLAG(SU_BEAMFORMER_CAPABLE, "SU-BEAMFORMER-CAPABLE"); + PFLAG(SU_BEAMFORMEE_CAPABLE, "SU-BEAMFORMEE-CAPABLE"); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tBEAMFORMEE-STS: 0x%x\n", + (vhtc->cap & IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK) >> + IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tSOUNDING-DIMENSIONS: 0x%x\n", + (vhtc->cap & IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK) + >> IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT); + PFLAG(MU_BEAMFORMER_CAPABLE, "MU-BEAMFORMER-CAPABLE"); + PFLAG(MU_BEAMFORMEE_CAPABLE, "MU-BEAMFORMEE-CAPABLE"); + PFLAG(VHT_TXOP_PS, "TXOP-PS"); + PFLAG(HTC_VHT, "HTC-VHT"); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMPDU-LENGTH-EXPONENT: 0x%x\n", + (vhtc->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK) >> + IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT); + PFLAG(VHT_LINK_ADAPTATION_VHT_UNSOL_MFB, + "LINK-ADAPTATION-VHT-UNSOL-MFB"); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tLINK-ADAPTATION-VHT-MRQ-MFB: 0x%x\n", + (vhtc->cap & IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB) >> 26); + PFLAG(RX_ANTENNA_PATTERN, "RX-ANTENNA-PATTERN"); + PFLAG(TX_ANTENNA_PATTERN, "TX-ANTENNA-PATTERN"); p += scnprintf(p, sizeof(buf)+buf-p, "RX MCS: %.4x\n", le16_to_cpu(vhtc->vht_mcs.rx_mcs_map)); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 184473c25..ba5fc1f01 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1094,7 +1094,7 @@ static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, trace_drv_get_expected_throughput(sta); if (local->ops->get_expected_throughput) - ret = local->ops->get_expected_throughput(sta); + ret = local->ops->get_expected_throughput(&local->hw, sta); trace_drv_return_u32(local, ret); return ret; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9438c9406..f56d342c3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -30,6 +30,7 @@ #include #include #include +#include #include "key.h" #include "sta_info.h" #include "debug.h" @@ -805,10 +806,19 @@ enum txq_info_flags { IEEE80211_TXQ_NO_AMSDU, }; +/** + * struct txq_info - per tid queue + * + * @tin: contains packets split into multiple flows + * @def_flow: used as a fallback flow when a packet destined to @tin hashes to + * a fq_flow which is already owned by a different tin + * @def_cvars: codel vars for @def_flow + */ struct txq_info { - struct sk_buff_head queue; + struct fq_tin tin; + struct fq_flow def_flow; + struct codel_vars def_cvars; unsigned long flags; - unsigned long byte_cnt; /* keep last! */ struct ieee80211_txq txq; @@ -856,7 +866,7 @@ struct ieee80211_sub_if_data { bool control_port_no_encrypt; int encrypt_headroom; - atomic_t txqs_len[IEEE80211_NUM_ACS]; + atomic_t num_tx_queued; struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; struct mac80211_qos_map __rcu *qos_map; @@ -1099,6 +1109,11 @@ struct ieee80211_local { * it first anyway so they become a no-op */ struct ieee80211_hw hw; + struct fq fq; + struct codel_vars *cvars; + struct codel_params cparams; + struct codel_stats cstats; + const struct ieee80211_ops *ops; /* @@ -1235,6 +1250,7 @@ struct ieee80211_local { int scan_channel_idx; int scan_ies_len; int hw_scan_ies_bufsize; + struct cfg80211_scan_info scan_info; struct work_struct sched_scan_stopped_work; struct ieee80211_sub_if_data __rcu *sched_scan_sdata; @@ -1931,9 +1947,13 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) return true; } -void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, - struct txq_info *txq, int tid); +int ieee80211_txq_setup_flows(struct ieee80211_local *local); +void ieee80211_txq_teardown_flows(struct ieee80211_local *local); +void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct txq_info *txq, int tid); +void ieee80211_txq_purge(struct ieee80211_local *local, + struct txq_info *txqi); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index c59af3eb9..b123a9e32 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -779,6 +779,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { struct ieee80211_local *local = sdata->local; + struct fq *fq = &local->fq; unsigned long flags; struct sk_buff *skb, *tmp; u32 hw_reconf_flags = 0; @@ -977,12 +978,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (sdata->vif.txq) { struct txq_info *txqi = to_txq_info(sdata->vif.txq); - spin_lock_bh(&txqi->queue.lock); - ieee80211_purge_tx_queue(&local->hw, &txqi->queue); - txqi->byte_cnt = 0; - spin_unlock_bh(&txqi->queue.lock); - - atomic_set(&sdata->txqs_len[txqi->txq.ac], 0); + spin_lock_bh(&fq->lock); + ieee80211_txq_purge(local, txqi); + spin_unlock_bh(&fq->lock); } if (local->open_count == 0) @@ -1198,6 +1196,12 @@ static void ieee80211_if_setup(struct net_device *dev) dev->destructor = ieee80211_if_free; } +static void ieee80211_if_setup_no_queue(struct net_device *dev) +{ + ieee80211_if_setup(dev); + dev->priv_flags |= IFF_NO_QUEUE; +} + static void ieee80211_iface_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = @@ -1707,6 +1711,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, struct net_device *ndev = NULL; struct ieee80211_sub_if_data *sdata = NULL; struct txq_info *txqi; + void (*if_setup)(struct net_device *dev); int ret, i; int txqs = 1; @@ -1734,12 +1739,17 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; + if (local->ops->wake_tx_queue) + if_setup = ieee80211_if_setup_no_queue; + else + if_setup = ieee80211_if_setup; + if (local->hw.queues >= IEEE80211_NUM_ACS) txqs = IEEE80211_NUM_ACS; ndev = alloc_netdev_mqs(size + txq_size, name, name_assign_type, - ieee80211_if_setup, txqs, 1); + if_setup, txqs, 1); if (!ndev) return -ENOMEM; dev_net_set(ndev, wiphy_net(local->hw.wiphy)); @@ -1780,7 +1790,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, if (txq_size) { txqi = netdev_priv(ndev) + size; - ieee80211_init_tx_queue(sdata, NULL, txqi, 0); + ieee80211_txq_init(sdata, NULL, txqi, 0); } sdata->dev = ndev; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 7ee91d615..d00ea9b13 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1055,9 +1055,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->dynamic_ps_forced_timeout = -1; - if (!local->hw.txq_ac_max_pending) - local->hw.txq_ac_max_pending = 64; - result = ieee80211_wep_init(local); if (result < 0) wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", @@ -1089,6 +1086,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); + result = ieee80211_txq_setup_flows(local); + if (result) + goto fail_flows; + #ifdef CONFIG_INET local->ifa_notifier.notifier_call = ieee80211_ifa_changed; result = register_inetaddr_notifier(&local->ifa_notifier); @@ -1114,6 +1115,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) #if defined(CONFIG_INET) || defined(CONFIG_IPV6) fail_ifa: #endif + ieee80211_txq_teardown_flows(local); + fail_flows: rtnl_lock(); rate_control_deinitialize(local); ieee80211_remove_interfaces(local); @@ -1172,6 +1175,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); skb_queue_purge(&local->skb_queue_tdls_chsw); + ieee80211_txq_teardown_flows(local); destroy_workqueue(local->workqueue); wiphy_unregister(local->hw.wiphy); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 6a1603bcd..42120d965 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -148,25 +148,7 @@ u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata) void mesh_sta_cleanup(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed = 0; - - /* - * maybe userspace handles peer allocation and peering, but in either - * case the beacon is still generated by the kernel and we might need - * an update. - */ - if (sdata->u.mesh.user_mpm && - sta->mesh->plink_state == NL80211_PLINK_ESTAB) - changed |= mesh_plink_dec_estab_count(sdata); - changed |= mesh_accept_plinks_update(sdata); - if (!sdata->u.mesh.user_mpm) { - changed |= mesh_plink_deactivate(sta); - del_timer_sync(&sta->mesh->plink_timer); - } - - /* make sure no readers can access nexthop sta from here on */ - mesh_path_flush_by_nexthop(sta); - synchronize_net(); + u32 changed = mesh_plink_deactivate(sta); if (changed) ieee80211_mbss_info_change_notify(sdata, changed); @@ -899,20 +881,22 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) netif_carrier_off(sdata->dev); + /* flush STAs and mpaths on this iface */ + sta_info_flush(sdata); + mesh_path_flush_by_iface(sdata); + /* stop the beacon */ ifmsh->mesh_id_len = 0; sdata->vif.bss_conf.enable_beacon = false; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); + + /* remove beacon */ bcn = rcu_dereference_protected(ifmsh->beacon, lockdep_is_held(&sdata->wdev.mtx)); RCU_INIT_POINTER(ifmsh->beacon, NULL); kfree_rcu(bcn, rcu_head); - /* flush STAs and mpaths on this iface */ - sta_info_flush(sdata); - mesh_path_flush_by_iface(sdata); - /* free all potentially still buffered group-addressed frames */ local->total_ps_buffered -= skb_queue_len(&ifmsh->ps.bc_buf); skb_queue_purge(&ifmsh->ps.bc_buf); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 8f9c3bde8..faccef977 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -746,6 +746,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, sta = next_hop_deref_protected(mpath); if (mpath->flags & MESH_PATH_ACTIVE && ether_addr_equal(ta, sta->sta.addr) && + !(mpath->flags & MESH_PATH_FIXED) && (!(mpath->flags & MESH_PATH_SN_VALID) || SN_GT(target_sn, mpath->sn) || target_sn == 0)) { mpath->flags &= ~MESH_PATH_ACTIVE; @@ -1012,7 +1013,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) goto enddiscovery; spin_lock_bh(&mpath->state_lock); - if (mpath->flags & MESH_PATH_DELETED) { + if (mpath->flags & (MESH_PATH_DELETED | MESH_PATH_FIXED)) { spin_unlock_bh(&mpath->state_lock); goto enddiscovery; } diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 6db2ddfa0..f0e6175a9 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -826,7 +826,7 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop) mpath->metric = 0; mpath->hop_count = 0; mpath->exp_time = 0; - mpath->flags |= MESH_PATH_FIXED; + mpath->flags = MESH_PATH_FIXED | MESH_PATH_SN_VALID; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); mesh_path_tx_pending(mpath); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 79f2a0a13..7fcdcf622 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -370,13 +370,21 @@ u32 mesh_plink_deactivate(struct sta_info *sta) spin_lock_bh(&sta->mesh->plink_lock); changed = __mesh_plink_deactivate(sta); - sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED; - mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CLOSE, - sta->sta.addr, sta->mesh->llid, sta->mesh->plid, - sta->mesh->reason); + + if (!sdata->u.mesh.user_mpm) { + sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED; + mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CLOSE, + sta->sta.addr, sta->mesh->llid, + sta->mesh->plid, sta->mesh->reason); + } spin_unlock_bh(&sta->mesh->plink_lock); + if (!sdata->u.mesh.user_mpm) + del_timer_sync(&sta->mesh->plink_timer); mesh_path_flush_by_nexthop(sta); + /* make sure no readers can access nexthop sta from here on */ + synchronize_net(); + return changed; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5e65e8389..9dce3b157 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1268,7 +1268,7 @@ static void sta_ps_start(struct sta_info *sta) for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - if (!skb_queue_len(&txqi->queue)) + if (txqi->tin.backlog_packets) set_bit(tid, &sta->txq_buffered_tids); else clear_bit(tid, &sta->txq_buffered_tids); @@ -1624,8 +1624,13 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) return RX_DROP_MONITOR; /* unexpected BIP keyidx */ - if (rx->sta) + if (rx->sta) { + if (ieee80211_is_group_privacy_action(skb) && + test_sta_flag(rx->sta, WLAN_STA_MFP)) + return RX_DROP_MONITOR; + rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]); + } if (!rx->key) rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]); } else if (!ieee80211_has_protected(fc)) { diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index f9648ef9e..070b40f15 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -7,6 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2013-2015 Intel Mobile Communications GmbH + * Copyright 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -70,6 +71,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local, .boottime_ns = rx_status->boottime_ns, }; bool signal_valid; + struct ieee80211_sub_if_data *scan_sdata; if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; @@ -83,6 +85,20 @@ ieee80211_bss_info_update(struct ieee80211_local *local, bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10; bss_meta.chan = channel; + + rcu_read_lock(); + scan_sdata = rcu_dereference(local->scan_sdata); + if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && + scan_sdata->vif.bss_conf.assoc && + ieee80211_have_rx_timestamp(rx_status)) { + bss_meta.parent_tsf = + ieee80211_calculate_rx_timestamp(local, rx_status, + len + FCS_LEN, 24); + ether_addr_copy(bss_meta.parent_bssid, + scan_sdata->vif.bss_conf.bssid); + } + rcu_read_unlock(); + cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, len, GFP_ATOMIC); if (!cbss) @@ -345,6 +361,12 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (rc == 0) return; + + /* HW scan failed and is going to be reported as aborted, + * so clear old scan info. + */ + memset(&local->scan_info, 0, sizeof(local->scan_info)); + aborted = true; } kfree(local->hw_scan_req); @@ -353,8 +375,10 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); - if (scan_req != local->int_scan_req) - cfg80211_scan_done(scan_req, aborted); + if (scan_req != local->int_scan_req) { + local->scan_info.aborted = aborted; + cfg80211_scan_done(scan_req, &local->scan_info); + } RCU_INIT_POINTER(local->scan_req, NULL); scan_sdata = rcu_dereference_protected(local->scan_sdata, @@ -391,15 +415,19 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) ieee80211_start_next_roc(local); } -void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) +void ieee80211_scan_completed(struct ieee80211_hw *hw, + struct cfg80211_scan_info *info) { struct ieee80211_local *local = hw_to_local(hw); - trace_api_scan_completed(local, aborted); + trace_api_scan_completed(local, info); set_bit(SCAN_COMPLETED, &local->scanning); - if (aborted) + if (info->aborted) set_bit(SCAN_ABORTED, &local->scanning); + + memcpy(&local->scan_info, info, sizeof(*info)); + ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); } EXPORT_SYMBOL(ieee80211_scan_completed); @@ -566,6 +594,9 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; eth_broadcast_addr(local->hw_scan_req->req.bssid); + local->hw_scan_req->req.duration = req->duration; + local->hw_scan_req->req.duration_mandatory = + req->duration_mandatory; local->hw_scan_band = 0; @@ -1073,6 +1104,7 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) */ cancel_delayed_work(&local->scan_work); /* and clean up */ + memset(&local->scan_info, 0, sizeof(local->scan_info)); __ieee80211_scan_completed(&local->hw, true); out: mutex_unlock(&local->mtx); diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 2ddc661f0..97f4c9d6b 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -129,42 +129,31 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, } if (wide_bw_chansw_ie) { - new_vht_chandef.chan = new_chan; - new_vht_chandef.center_freq1 = - ieee80211_channel_to_frequency( + struct ieee80211_vht_operation vht_oper = { + .chan_width = + wide_bw_chansw_ie->new_channel_width, + .center_freq_seg1_idx = wide_bw_chansw_ie->new_center_freq_seg0, - new_band); - - switch (wide_bw_chansw_ie->new_channel_width) { - default: - /* hmmm, ignore VHT and use HT if present */ - case IEEE80211_VHT_CHANWIDTH_USE_HT: + .center_freq_seg2_idx = + wide_bw_chansw_ie->new_center_freq_seg1, + /* .basic_mcs_set doesn't matter */ + }; + + /* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT, + * to the previously parsed chandef + */ + new_vht_chandef = csa_ie->chandef; + + /* ignore if parsing fails */ + if (!ieee80211_chandef_vht_oper(&vht_oper, &new_vht_chandef)) new_vht_chandef.chan = NULL; - break; - case IEEE80211_VHT_CHANWIDTH_80MHZ: - new_vht_chandef.width = NL80211_CHAN_WIDTH_80; - break; - case IEEE80211_VHT_CHANWIDTH_160MHZ: - new_vht_chandef.width = NL80211_CHAN_WIDTH_160; - break; - case IEEE80211_VHT_CHANWIDTH_80P80MHZ: - /* field is otherwise reserved */ - new_vht_chandef.center_freq2 = - ieee80211_channel_to_frequency( - wide_bw_chansw_ie->new_center_freq_seg1, - new_band); - new_vht_chandef.width = NL80211_CHAN_WIDTH_80P80; - break; - } + if (sta_flags & IEEE80211_STA_DISABLE_80P80MHZ && new_vht_chandef.width == NL80211_CHAN_WIDTH_80P80) ieee80211_chandef_downgrade(&new_vht_chandef); if (sta_flags & IEEE80211_STA_DISABLE_160MHZ && new_vht_chandef.width == NL80211_CHAN_WIDTH_160) ieee80211_chandef_downgrade(&new_vht_chandef); - if (sta_flags & IEEE80211_STA_DISABLE_40MHZ && - new_vht_chandef.width > NL80211_CHAN_WIDTH_20) - ieee80211_chandef_downgrade(&new_vht_chandef); } /* if VHT data is there validate & use it */ diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 5ccfdbd40..aa58df80e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -90,6 +90,7 @@ static void __cleanup_single_sta(struct sta_info *sta) struct tid_ampdu_tx *tid_tx; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; + struct fq *fq = &local->fq; struct ps_data *ps; if (test_sta_flag(sta, WLAN_STA_PS_STA) || @@ -113,11 +114,10 @@ static void __cleanup_single_sta(struct sta_info *sta) if (sta->sta.txq[0]) { for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); - int n = skb_queue_len(&txqi->queue); - ieee80211_purge_tx_queue(&local->hw, &txqi->queue); - atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]); - txqi->byte_cnt = 0; + spin_lock_bh(&fq->lock); + ieee80211_txq_purge(local, txqi); + spin_unlock_bh(&fq->lock); } } @@ -368,7 +368,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txq = txq_data + i * size; - ieee80211_init_tx_queue(sdata, sta, txq, i); + ieee80211_txq_init(sdata, sta, txq, i); } } @@ -1211,7 +1211,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); - if (!skb_queue_len(&txqi->queue)) + if (!txqi->tin.backlog_packets) continue; drv_wake_tx_queue(local, txqi); @@ -1616,7 +1616,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, sta_info_recalc_tim(sta); } else { - unsigned long tids = sta->txq_buffered_tids & driver_release_tids; int tid; /* @@ -1648,7 +1647,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - if (!(tids & BIT(tid)) || skb_queue_len(&txqi->queue)) + if (!(driver_release_tids & BIT(tid)) || + txqi->tin.backlog_packets) continue; sta_info_recalc_tim(sta); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index c6d5c724e..a2a682696 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -771,6 +771,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) clear_sta_flag(sta, WLAN_STA_SP); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); + + /* mesh Peer Service Period support */ + if (ieee80211_vif_is_mesh(&sta->sdata->vif) && + ieee80211_is_data_qos(fc)) + ieee80211_mpsp_trigger_process( + ieee80211_get_qos_ctl(hdr), sta, true, acked); + if (!acked && test_sta_flag(sta, WLAN_STA_PS_STA)) { /* * The STA is in power save mode, so assume @@ -781,13 +788,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) return; } - /* mesh Peer Service Period support */ - if (ieee80211_vif_is_mesh(&sta->sdata->vif) && - ieee80211_is_data_qos(fc)) - ieee80211_mpsp_trigger_process( - ieee80211_get_qos_ctl(hdr), - sta, true, acked); - if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 1c7d45a6d..afca7d103 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -333,10 +333,11 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, if (!uc.center_freq1) return; - /* proceed to downgrade the chandef until usable or the same */ + /* proceed to downgrade the chandef until usable or the same as AP BW */ while (uc.width > max_width || - !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, - sdata->wdev.iftype)) + (uc.width > sta->tdls_chandef.width && + !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, + sdata->wdev.iftype))) ieee80211_chandef_downgrade(&uc); if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) { @@ -1747,6 +1748,7 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata, goto out; } + ret = 0; call_drv: drv_tdls_recv_channel_switch(sdata->local, sdata, ¶ms); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8bad2ad81..18b285e06 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -24,7 +24,10 @@ #include #include #include +#include +#include #include +#include #include "ieee80211_i.h" #include "driver-ops.h" @@ -590,6 +593,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) else if (tx->sta && (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx]))) tx->key = key; + else if (ieee80211_is_group_privacy_action(tx->skb) && + (key = rcu_dereference(tx->sdata->default_multicast_key))) + tx->key = key; else if (ieee80211_is_mgmt(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && ieee80211_is_robust_mgmt_frame(tx->skb) && @@ -622,7 +628,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) case WLAN_CIPHER_SUITE_GCMP_256: if (!ieee80211_is_data_present(hdr->frame_control) && !ieee80211_use_mfp(hdr->frame_control, tx->sta, - tx->skb)) + tx->skb) && + !ieee80211_is_group_privacy_action(tx->skb)) tx->key = NULL; else skip_hw = (tx->key->conf.flags & @@ -789,6 +796,36 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid) return ret; } +static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_txq *txq = NULL; + + if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || + (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) + return NULL; + + if (!ieee80211_is_data(hdr->frame_control)) + return NULL; + + if (pubsta) { + u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + + txq = pubsta->txq[tid]; + } else if (vif) { + txq = vif->txq; + } + + if (!txq) + return NULL; + + return to_txq_info(txq); +} + static ieee80211_tx_result debug_noinline ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { @@ -846,7 +883,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) tid = *qc & IEEE80211_QOS_CTL_TID_MASK; tx->sta->tx_stats.msdu[tid]++; - if (!tx->sta->sta.txq[0]) + if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta, + tx->skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; @@ -1236,82 +1274,229 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } -static void ieee80211_drv_tx(struct ieee80211_local *local, - struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, +static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) +{ + IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); +} + +static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi) +{ + IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif; +} + +static u32 codel_skb_len_func(const struct sk_buff *skb) +{ + return skb->len; +} + +static codel_time_t codel_skb_time_func(const struct sk_buff *skb) +{ + const struct ieee80211_tx_info *info; + + info = (const struct ieee80211_tx_info *)skb->cb; + return info->control.enqueue_time; +} + +static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars, + void *ctx) +{ + struct ieee80211_local *local; + struct txq_info *txqi; + struct fq *fq; + struct fq_flow *flow; + + txqi = ctx; + local = vif_to_sdata(txqi->txq.vif)->local; + fq = &local->fq; + + if (cvars == &txqi->def_cvars) + flow = &txqi->def_flow; + else + flow = &fq->flows[cvars - local->cvars]; + + return fq_flow_dequeue(fq, flow); +} + +static void codel_drop_func(struct sk_buff *skb, + void *ctx) +{ + struct ieee80211_local *local; + struct ieee80211_hw *hw; + struct txq_info *txqi; + + txqi = ctx; + local = vif_to_sdata(txqi->txq.vif)->local; + hw = &local->hw; + + ieee80211_free_txskb(hw, skb); +} + +static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, + struct fq_tin *tin, + struct fq_flow *flow) +{ + struct ieee80211_local *local; + struct txq_info *txqi; + struct codel_vars *cvars; + struct codel_params *cparams; + struct codel_stats *cstats; + + local = container_of(fq, struct ieee80211_local, fq); + txqi = container_of(tin, struct txq_info, tin); + cparams = &local->cparams; + cstats = &local->cstats; + + if (flow == &txqi->def_flow) + cvars = &txqi->def_cvars; + else + cvars = &local->cvars[flow - fq->flows]; + + return codel_dequeue(txqi, + &flow->backlog, + cparams, + cvars, + cstats, + codel_skb_len_func, + codel_skb_time_func, + codel_drop_func, + codel_dequeue_func); +} + +static void fq_skb_free_func(struct fq *fq, + struct fq_tin *tin, + struct fq_flow *flow, struct sk_buff *skb) { - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_tx_control control = { - .sta = pubsta, - }; - struct ieee80211_txq *txq = NULL; + struct ieee80211_local *local; + + local = container_of(fq, struct ieee80211_local, fq); + ieee80211_free_txskb(&local->hw, skb); +} + +static struct fq_flow *fq_flow_get_default_func(struct fq *fq, + struct fq_tin *tin, + int idx, + struct sk_buff *skb) +{ struct txq_info *txqi; - u8 ac; - if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || - (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) - goto tx_normal; + txqi = container_of(tin, struct txq_info, tin); + return &txqi->def_flow; +} - if (!ieee80211_is_data(hdr->frame_control)) - goto tx_normal; +static void ieee80211_txq_enqueue(struct ieee80211_local *local, + struct txq_info *txqi, + struct sk_buff *skb) +{ + struct fq *fq = &local->fq; + struct fq_tin *tin = &txqi->tin; - if (pubsta) { - u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + ieee80211_set_skb_enqueue_time(skb); + fq_tin_enqueue(fq, tin, skb, + fq_skb_free_func, + fq_flow_get_default_func); +} - txq = pubsta->txq[tid]; - } else if (vif) { - txq = vif->txq; +void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct txq_info *txqi, int tid) +{ + fq_tin_init(&txqi->tin); + fq_flow_init(&txqi->def_flow); + codel_vars_init(&txqi->def_cvars); + + txqi->txq.vif = &sdata->vif; + + if (sta) { + txqi->txq.sta = &sta->sta; + sta->sta.txq[tid] = &txqi->txq; + txqi->txq.tid = tid; + txqi->txq.ac = ieee802_1d_to_ac[tid & 7]; + } else { + sdata->vif.txq = &txqi->txq; + txqi->txq.tid = 0; + txqi->txq.ac = IEEE80211_AC_BE; } +} - if (!txq) - goto tx_normal; +void ieee80211_txq_purge(struct ieee80211_local *local, + struct txq_info *txqi) +{ + struct fq *fq = &local->fq; + struct fq_tin *tin = &txqi->tin; - ac = txq->ac; - txqi = to_txq_info(txq); - atomic_inc(&sdata->txqs_len[ac]); - if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending) - netif_stop_subqueue(sdata->dev, ac); + fq_tin_reset(fq, tin, fq_skb_free_func); +} + +int ieee80211_txq_setup_flows(struct ieee80211_local *local) +{ + struct fq *fq = &local->fq; + int ret; + int i; + + if (!local->ops->wake_tx_queue) + return 0; + + ret = fq_init(fq, 4096); + if (ret) + return ret; + + codel_params_init(&local->cparams); + codel_stats_init(&local->cstats); + local->cparams.interval = MS2TIME(100); + local->cparams.target = MS2TIME(20); + local->cparams.ecn = true; + + local->cvars = kcalloc(fq->flows_cnt, sizeof(local->cvars[0]), + GFP_KERNEL); + if (!local->cvars) { + spin_lock_bh(&fq->lock); + fq_reset(fq, fq_skb_free_func); + spin_unlock_bh(&fq->lock); + return -ENOMEM; + } + + for (i = 0; i < fq->flows_cnt; i++) + codel_vars_init(&local->cvars[i]); - spin_lock_bh(&txqi->queue.lock); - txqi->byte_cnt += skb->len; - __skb_queue_tail(&txqi->queue, skb); - spin_unlock_bh(&txqi->queue.lock); + return 0; +} + +void ieee80211_txq_teardown_flows(struct ieee80211_local *local) +{ + struct fq *fq = &local->fq; - drv_wake_tx_queue(local, txqi); + if (!local->ops->wake_tx_queue) + return; - return; + kfree(local->cvars); + local->cvars = NULL; -tx_normal: - drv_tx(local, &control, skb); + spin_lock_bh(&fq->lock); + fq_reset(fq, fq_skb_free_func); + spin_unlock_bh(&fq->lock); } struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct ieee80211_local *local = hw_to_local(hw); - struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); struct txq_info *txqi = container_of(txq, struct txq_info, txq); struct ieee80211_hdr *hdr; struct sk_buff *skb = NULL; - u8 ac = txq->ac; + struct fq *fq = &local->fq; + struct fq_tin *tin = &txqi->tin; - spin_lock_bh(&txqi->queue.lock); + spin_lock_bh(&fq->lock); if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags)) goto out; - skb = __skb_dequeue(&txqi->queue); + skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); if (!skb) goto out; - txqi->byte_cnt -= skb->len; - - atomic_dec(&sdata->txqs_len[ac]); - if (__netif_subqueue_stopped(sdata->dev, ac)) - ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]); + ieee80211_set_skb_vif(skb, txqi); hdr = (struct ieee80211_hdr *)skb->data; if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { @@ -1327,11 +1512,15 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, } out: - spin_unlock_bh(&txqi->queue.lock); + spin_unlock_bh(&fq->lock); if (skb && skb_has_frag_list(skb) && - !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) - skb_linearize(skb); + !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { + if (skb_linearize(skb)) { + ieee80211_free_txskb(&local->hw, skb); + return NULL; + } + } return skb; } @@ -1343,7 +1532,10 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, struct sk_buff_head *skbs, bool txpending) { + struct ieee80211_tx_control control = {}; + struct fq *fq = &local->fq; struct sk_buff *skb, *tmp; + struct txq_info *txqi; unsigned long flags; skb_queue_walk_safe(skbs, skb, tmp) { @@ -1358,6 +1550,21 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, } #endif + txqi = ieee80211_get_txq(local, vif, sta, skb); + if (txqi) { + info->control.vif = vif; + + __skb_unlink(skb, skbs); + + spin_lock_bh(&fq->lock); + ieee80211_txq_enqueue(local, txqi, skb); + spin_unlock_bh(&fq->lock); + + drv_wake_tx_queue(local, txqi); + + continue; + } + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { @@ -1400,9 +1607,10 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; + control.sta = sta; __skb_unlink(skb, skbs); - ieee80211_drv_tx(local, vif, sta, skb); + drv_tx(local, &control, skb); } return true; @@ -2882,6 +3090,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; + struct fq *fq = &local->fq; + struct fq_tin *tin; + struct fq_flow *flow; u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; struct ieee80211_txq *txq = sta->sta.txq[tid]; struct txq_info *txqi; @@ -2893,6 +3104,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, __be16 len; void *data; bool ret = false; + unsigned int orig_len; int n = 1, nfrags; if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) @@ -2909,12 +3121,20 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, max_amsdu_len = min_t(int, max_amsdu_len, sta->sta.max_rc_amsdu_len); - spin_lock_bh(&txqi->queue.lock); + spin_lock_bh(&fq->lock); + + /* TODO: Ideally aggregation should be done on dequeue to remain + * responsive to environment changes. + */ - head = skb_peek_tail(&txqi->queue); + tin = &txqi->tin; + flow = fq_flow_classify(fq, tin, skb, fq_flow_get_default_func); + head = skb_peek_tail(&flow->queue); if (!head) goto out; + orig_len = head->len; + if (skb->len + head->len > max_amsdu_len) goto out; @@ -2953,8 +3173,13 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, head->data_len += skb->len; *frag_tail = skb; + flow->backlog += head->len - orig_len; + tin->backlog_bytes += head->len - orig_len; + + fq_recalc_backlog(fq, tin, flow); + out: - spin_unlock_bh(&txqi->queue.lock); + spin_unlock_bh(&fq->lock); return ret; } @@ -3044,7 +3269,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { *ieee80211_get_qos_ctl(hdr) = tid; - if (!sta->sta.txq[0]) + if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); } else { info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 905003f75..42bf0b668 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -244,6 +244,9 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; + if (local->ops->wake_tx_queue) + return; + if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; @@ -260,11 +263,6 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; - if (local->ops->wake_tx_queue && - (atomic_read(&sdata->txqs_len[ac]) > - local->hw.txq_ac_max_pending)) - continue; - if (ac_queue == queue || (sdata->vif.cab_queue == queue && local->queue_stop_reasons[ac_queue] == 0 && @@ -352,6 +350,9 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue])) return; + if (local->ops->wake_tx_queue) + return; + if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; @@ -3388,25 +3389,6 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo) return buf; } -void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, - struct txq_info *txqi, int tid) -{ - skb_queue_head_init(&txqi->queue); - txqi->txq.vif = &sdata->vif; - - if (sta) { - txqi->txq.sta = &sta->sta; - sta->sta.txq[tid] = &txqi->txq; - txqi->txq.tid = tid; - txqi->txq.ac = ieee802_1d_to_ac[tid & 7]; - } else { - sdata->vif.txq = &txqi->txq; - txqi->txq.tid = 0; - txqi->txq.ac = IEEE80211_AC_BE; - } -} - void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt) @@ -3414,9 +3396,9 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq, struct txq_info *txqi = to_txq_info(txq); if (frame_cnt) - *frame_cnt = txqi->queue.qlen; + *frame_cnt = txqi->tin.backlog_packets; if (byte_cnt) - *byte_cnt = txqi->byte_cnt; + *byte_cnt = txqi->tin.backlog_bytes; } EXPORT_SYMBOL(ieee80211_txq_get_depth); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 0b80a7140..5c161e775 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -91,7 +91,7 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->len <= mtu) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; @@ -1009,9 +1009,12 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, unsigned int flags; if (event == NETDEV_REGISTER) { - /* For now just support ethernet devices */ - if ((dev->type == ARPHRD_ETHER) || - (dev->type == ARPHRD_LOOPBACK)) { + /* For now just support Ethernet, IPGRE, SIT and IPIP devices */ + if (dev->type == ARPHRD_ETHER || + dev->type == ARPHRD_LOOPBACK || + dev->type == ARPHRD_IPGRE || + dev->type == ARPHRD_SIT || + dev->type == ARPHRD_TUNNEL) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) return notifier_from_errno(PTR_ERR(mdev)); diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig new file mode 100644 index 000000000..08a8a6031 --- /dev/null +++ b/net/ncsi/Kconfig @@ -0,0 +1,12 @@ +# +# Configuration for NCSI support +# + +config NET_NCSI + bool "NCSI interface support" + depends on INET + ---help--- + This module provides NCSI (Network Controller Sideband Interface) + support. Enable this only if your system connects to a network + device via NCSI and the ethernet driver you're using supports + the protocol explicitly. diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile new file mode 100644 index 000000000..dd12b564f --- /dev/null +++ b/net/ncsi/Makefile @@ -0,0 +1,4 @@ +# +# Makefile for NCSI API +# +obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h new file mode 100644 index 000000000..33738c060 --- /dev/null +++ b/net/ncsi/internal.h @@ -0,0 +1,328 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NCSI_INTERNAL_H__ +#define __NCSI_INTERNAL_H__ + +enum { + NCSI_CAP_BASE = 0, + NCSI_CAP_GENERIC = 0, + NCSI_CAP_BC, + NCSI_CAP_MC, + NCSI_CAP_BUFFER, + NCSI_CAP_AEN, + NCSI_CAP_VLAN, + NCSI_CAP_MAX +}; + +enum { + NCSI_CAP_GENERIC_HWA = 0x01, /* HW arbitration */ + NCSI_CAP_GENERIC_HDS = 0x02, /* HNC driver status change */ + NCSI_CAP_GENERIC_FC = 0x04, /* HNC to MC flow control */ + NCSI_CAP_GENERIC_FC1 = 0x08, /* MC to HNC flow control */ + NCSI_CAP_GENERIC_MC = 0x10, /* Global MC filtering */ + NCSI_CAP_GENERIC_HWA_UNKNOWN = 0x00, /* Unknown HW arbitration */ + NCSI_CAP_GENERIC_HWA_SUPPORT = 0x20, /* Supported HW arbitration */ + NCSI_CAP_GENERIC_HWA_NOT_SUPPORT = 0x40, /* No HW arbitration */ + NCSI_CAP_GENERIC_HWA_RESERVED = 0x60, /* Reserved HW arbitration */ + NCSI_CAP_GENERIC_HWA_MASK = 0x60, /* Mask for HW arbitration */ + NCSI_CAP_GENERIC_MASK = 0x7f, + NCSI_CAP_BC_ARP = 0x01, /* ARP packet filtering */ + NCSI_CAP_BC_DHCPC = 0x02, /* DHCP client filtering */ + NCSI_CAP_BC_DHCPS = 0x04, /* DHCP server filtering */ + NCSI_CAP_BC_NETBIOS = 0x08, /* NetBIOS packet filtering */ + NCSI_CAP_BC_MASK = 0x0f, + NCSI_CAP_MC_IPV6_NEIGHBOR = 0x01, /* IPv6 neighbor filtering */ + NCSI_CAP_MC_IPV6_ROUTER = 0x02, /* IPv6 router filering */ + NCSI_CAP_MC_DHCPV6_RELAY = 0x04, /* DHCPv6 relay / server MC */ + NCSI_CAP_MC_DHCPV6_WELL_KNOWN = 0x08, /* DHCPv6 well-known MC */ + NCSI_CAP_MC_IPV6_MLD = 0x10, /* IPv6 MLD filtering */ + NCSI_CAP_MC_IPV6_NEIGHBOR_S = 0x20, /* IPv6 neighbour filtering */ + NCSI_CAP_MC_MASK = 0x3f, + NCSI_CAP_AEN_LSC = 0x01, /* Link status change */ + NCSI_CAP_AEN_CR = 0x02, /* Configuration required */ + NCSI_CAP_AEN_HDS = 0x04, /* HNC driver status */ + NCSI_CAP_AEN_MASK = 0x07, + NCSI_CAP_VLAN_ONLY = 0x01, /* Filter VLAN packet only */ + NCSI_CAP_VLAN_NO = 0x02, /* Filter VLAN and non-VLAN */ + NCSI_CAP_VLAN_ANY = 0x04, /* Filter Any-and-non-VLAN */ + NCSI_CAP_VLAN_MASK = 0x07 +}; + +enum { + NCSI_MODE_BASE = 0, + NCSI_MODE_ENABLE = 0, + NCSI_MODE_TX_ENABLE, + NCSI_MODE_LINK, + NCSI_MODE_VLAN, + NCSI_MODE_BC, + NCSI_MODE_MC, + NCSI_MODE_AEN, + NCSI_MODE_FC, + NCSI_MODE_MAX +}; + +enum { + NCSI_FILTER_BASE = 0, + NCSI_FILTER_VLAN = 0, + NCSI_FILTER_UC, + NCSI_FILTER_MC, + NCSI_FILTER_MIXED, + NCSI_FILTER_MAX +}; + +struct ncsi_channel_version { + u32 version; /* Supported BCD encoded NCSI version */ + u32 alpha2; /* Supported BCD encoded NCSI version */ + u8 fw_name[12]; /* Firware name string */ + u32 fw_version; /* Firmware version */ + u16 pci_ids[4]; /* PCI identification */ + u32 mf_id; /* Manufacture ID */ +}; + +struct ncsi_channel_cap { + u32 index; /* Index of channel capabilities */ + u32 cap; /* NCSI channel capability */ +}; + +struct ncsi_channel_mode { + u32 index; /* Index of channel modes */ + u32 enable; /* Enabled or disabled */ + u32 size; /* Valid entries in ncm_data[] */ + u32 data[8]; /* Data entries */ +}; + +struct ncsi_channel_filter { + u32 index; /* Index of channel filters */ + u32 total; /* Total entries in the filter table */ + u64 bitmap; /* Bitmap of valid entries */ + u32 data[]; /* Data for the valid entries */ +}; + +struct ncsi_channel_stats { + u32 hnc_cnt_hi; /* Counter cleared */ + u32 hnc_cnt_lo; /* Counter cleared */ + u32 hnc_rx_bytes; /* Rx bytes */ + u32 hnc_tx_bytes; /* Tx bytes */ + u32 hnc_rx_uc_pkts; /* Rx UC packets */ + u32 hnc_rx_mc_pkts; /* Rx MC packets */ + u32 hnc_rx_bc_pkts; /* Rx BC packets */ + u32 hnc_tx_uc_pkts; /* Tx UC packets */ + u32 hnc_tx_mc_pkts; /* Tx MC packets */ + u32 hnc_tx_bc_pkts; /* Tx BC packets */ + u32 hnc_fcs_err; /* FCS errors */ + u32 hnc_align_err; /* Alignment errors */ + u32 hnc_false_carrier; /* False carrier detection */ + u32 hnc_runt_pkts; /* Rx runt packets */ + u32 hnc_jabber_pkts; /* Rx jabber packets */ + u32 hnc_rx_pause_xon; /* Rx pause XON frames */ + u32 hnc_rx_pause_xoff; /* Rx XOFF frames */ + u32 hnc_tx_pause_xon; /* Tx XON frames */ + u32 hnc_tx_pause_xoff; /* Tx XOFF frames */ + u32 hnc_tx_s_collision; /* Single collision frames */ + u32 hnc_tx_m_collision; /* Multiple collision frames */ + u32 hnc_l_collision; /* Late collision frames */ + u32 hnc_e_collision; /* Excessive collision frames */ + u32 hnc_rx_ctl_frames; /* Rx control frames */ + u32 hnc_rx_64_frames; /* Rx 64-bytes frames */ + u32 hnc_rx_127_frames; /* Rx 65-127 bytes frames */ + u32 hnc_rx_255_frames; /* Rx 128-255 bytes frames */ + u32 hnc_rx_511_frames; /* Rx 256-511 bytes frames */ + u32 hnc_rx_1023_frames; /* Rx 512-1023 bytes frames */ + u32 hnc_rx_1522_frames; /* Rx 1024-1522 bytes frames */ + u32 hnc_rx_9022_frames; /* Rx 1523-9022 bytes frames */ + u32 hnc_tx_64_frames; /* Tx 64-bytes frames */ + u32 hnc_tx_127_frames; /* Tx 65-127 bytes frames */ + u32 hnc_tx_255_frames; /* Tx 128-255 bytes frames */ + u32 hnc_tx_511_frames; /* Tx 256-511 bytes frames */ + u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */ + u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */ + u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */ + u32 hnc_rx_valid_bytes; /* Rx valid bytes */ + u32 hnc_rx_runt_pkts; /* Rx error runt packets */ + u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */ + u32 ncsi_rx_cmds; /* Rx NCSI commands */ + u32 ncsi_dropped_cmds; /* Dropped commands */ + u32 ncsi_cmd_type_errs; /* Command type errors */ + u32 ncsi_cmd_csum_errs; /* Command checksum errors */ + u32 ncsi_rx_pkts; /* Rx NCSI packets */ + u32 ncsi_tx_pkts; /* Tx NCSI packets */ + u32 ncsi_tx_aen_pkts; /* Tx AEN packets */ + u32 pt_tx_pkts; /* Tx packets */ + u32 pt_tx_dropped; /* Tx dropped packets */ + u32 pt_tx_channel_err; /* Tx channel errors */ + u32 pt_tx_us_err; /* Tx undersize errors */ + u32 pt_rx_pkts; /* Rx packets */ + u32 pt_rx_dropped; /* Rx dropped packets */ + u32 pt_rx_channel_err; /* Rx channel errors */ + u32 pt_rx_us_err; /* Rx undersize errors */ + u32 pt_rx_os_err; /* Rx oversize errors */ +}; + +struct ncsi_dev_priv; +struct ncsi_package; + +#define NCSI_PACKAGE_SHIFT 5 +#define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7) +#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1)) +#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c)) + +struct ncsi_channel { + unsigned char id; + int state; +#define NCSI_CHANNEL_INACTIVE 1 +#define NCSI_CHANNEL_ACTIVE 2 +#define NCSI_CHANNEL_INVISIBLE 3 + spinlock_t lock; /* Protect filters etc */ + struct ncsi_package *package; + struct ncsi_channel_version version; + struct ncsi_channel_cap caps[NCSI_CAP_MAX]; + struct ncsi_channel_mode modes[NCSI_MODE_MAX]; + struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; + struct ncsi_channel_stats stats; + struct timer_list timer; /* Link monitor timer */ + bool enabled; /* Timer is enabled */ + unsigned int timeout; /* Times of timeout */ + struct list_head node; + struct list_head link; +}; + +struct ncsi_package { + unsigned char id; /* NCSI 3-bits package ID */ + unsigned char uuid[16]; /* UUID */ + struct ncsi_dev_priv *ndp; /* NCSI device */ + spinlock_t lock; /* Protect the package */ + unsigned int channel_num; /* Number of channels */ + struct list_head channels; /* List of chanels */ + struct list_head node; /* Form list of packages */ +}; + +struct ncsi_request { + unsigned char id; /* Request ID - 0 to 255 */ + bool used; /* Request that has been assigned */ + bool driven; /* Drive state machine */ + struct ncsi_dev_priv *ndp; /* Associated NCSI device */ + struct sk_buff *cmd; /* Associated NCSI command packet */ + struct sk_buff *rsp; /* Associated NCSI response packet */ + struct timer_list timer; /* Timer on waiting for response */ + bool enabled; /* Time has been enabled or not */ +}; + +enum { + ncsi_dev_state_major = 0xff00, + ncsi_dev_state_minor = 0x00ff, + ncsi_dev_state_probe_deselect = 0x0201, + ncsi_dev_state_probe_package, + ncsi_dev_state_probe_channel, + ncsi_dev_state_probe_cis, + ncsi_dev_state_probe_gvi, + ncsi_dev_state_probe_gc, + ncsi_dev_state_probe_gls, + ncsi_dev_state_probe_dp, + ncsi_dev_state_config_sp = 0x0301, + ncsi_dev_state_config_cis, + ncsi_dev_state_config_sma, + ncsi_dev_state_config_ebf, +#if IS_ENABLED(CONFIG_IPV6) + ncsi_dev_state_config_egmf, +#endif + ncsi_dev_state_config_ecnt, + ncsi_dev_state_config_ec, + ncsi_dev_state_config_ae, + ncsi_dev_state_config_gls, + ncsi_dev_state_config_done, + ncsi_dev_state_suspend_select = 0x0401, + ncsi_dev_state_suspend_dcnt, + ncsi_dev_state_suspend_dc, + ncsi_dev_state_suspend_deselect, + ncsi_dev_state_suspend_done +}; + +struct ncsi_dev_priv { + struct ncsi_dev ndev; /* Associated NCSI device */ + unsigned int flags; /* NCSI device flags */ +#define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */ +#define NCSI_DEV_HWA 2 /* Enabled HW arbitration */ +#define NCSI_DEV_RESHUFFLE 4 + spinlock_t lock; /* Protect the NCSI device */ +#if IS_ENABLED(CONFIG_IPV6) + unsigned int inet6_addr_num; /* Number of IPv6 addresses */ +#endif + unsigned int package_num; /* Number of packages */ + struct list_head packages; /* List of packages */ + struct ncsi_request requests[256]; /* Request table */ + unsigned int request_id; /* Last used request ID */ + unsigned int pending_req_num; /* Number of pending requests */ + struct ncsi_package *active_package; /* Currently handled package */ + struct ncsi_channel *active_channel; /* Currently handled channel */ + struct list_head channel_queue; /* Config queue of channels */ + struct work_struct work; /* For channel management */ + struct packet_type ptype; /* NCSI packet Rx handler */ + struct list_head node; /* Form NCSI device list */ +}; + +struct ncsi_cmd_arg { + struct ncsi_dev_priv *ndp; /* Associated NCSI device */ + unsigned char type; /* Command in the NCSI packet */ + unsigned char id; /* Request ID (sequence number) */ + unsigned char package; /* Destination package ID */ + unsigned char channel; /* Detination channel ID or 0x1f */ + unsigned short payload; /* Command packet payload length */ + bool driven; /* Drive the state machine? */ + union { + unsigned char bytes[16]; /* Command packet specific data */ + unsigned short words[8]; + unsigned int dwords[4]; + }; +}; + +extern struct list_head ncsi_dev_list; +extern spinlock_t ncsi_dev_lock; + +#define TO_NCSI_DEV_PRIV(nd) \ + container_of(nd, struct ncsi_dev_priv, ndev) +#define NCSI_FOR_EACH_DEV(ndp) \ + list_for_each_entry_rcu(ndp, &ncsi_dev_list, node) +#define NCSI_FOR_EACH_PACKAGE(ndp, np) \ + list_for_each_entry_rcu(np, &ndp->packages, node) +#define NCSI_FOR_EACH_CHANNEL(np, nc) \ + list_for_each_entry_rcu(nc, &np->channels, node) + +/* Resources */ +int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data); +int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data); +int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index); +void ncsi_start_channel_monitor(struct ncsi_channel *nc); +void ncsi_stop_channel_monitor(struct ncsi_channel *nc); +struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, + unsigned char id); +struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, + unsigned char id); +struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp, + unsigned char id); +struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, + unsigned char id); +void ncsi_remove_package(struct ncsi_package *np); +void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, + unsigned char id, + struct ncsi_package **np, + struct ncsi_channel **nc); +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven); +void ncsi_free_request(struct ncsi_request *nr); +struct ncsi_dev *ncsi_find_dev(struct net_device *dev); +int ncsi_process_next_channel(struct ncsi_dev_priv *ndp); + +/* Packet handlers */ +u32 ncsi_calculate_checksum(unsigned char *data, int len); +int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca); +int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb); + +#endif /* __NCSI_INTERNAL_H__ */ diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c new file mode 100644 index 000000000..d46346844 --- /dev/null +++ b/net/ncsi/ncsi-aen.c @@ -0,0 +1,193 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "ncsi-pkt.h" + +static int ncsi_validate_aen_pkt(struct ncsi_aen_pkt_hdr *h, + const unsigned short payload) +{ + u32 checksum; + __be32 *pchecksum; + + if (h->common.revision != NCSI_PKT_REVISION) + return -EINVAL; + if (ntohs(h->common.length) != payload) + return -EINVAL; + + /* Validate checksum, which might be zeroes if the + * sender doesn't support checksum according to NCSI + * specification. + */ + pchecksum = (__be32 *)((void *)(h + 1) + payload - 4); + if (ntohl(*pchecksum) == 0) + return 0; + + checksum = ncsi_calculate_checksum((unsigned char *)h, + sizeof(*h) + payload - 4); + if (*pchecksum != htonl(checksum)) + return -EINVAL; + + return 0; +} + +static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, + struct ncsi_aen_pkt_hdr *h) +{ + struct ncsi_aen_lsc_pkt *lsc; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + unsigned long old_data; + unsigned long flags; + + /* Find the NCSI channel */ + ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update the link status */ + ncm = &nc->modes[NCSI_MODE_LINK]; + lsc = (struct ncsi_aen_lsc_pkt *)h; + old_data = ncm->data[2]; + ncm->data[2] = ntohl(lsc->status); + ncm->data[4] = ntohl(lsc->oem_status); + if (!((old_data ^ ncm->data[2]) & 0x1) || + !list_empty(&nc->link)) + return 0; + if (!(nc->state == NCSI_CHANNEL_INACTIVE && (ncm->data[2] & 0x1)) && + !(nc->state == NCSI_CHANNEL_ACTIVE && !(ncm->data[2] & 0x1))) + return 0; + + if (!(ndp->flags & NCSI_DEV_HWA) && + nc->state == NCSI_CHANNEL_ACTIVE) + ndp->flags |= NCSI_DEV_RESHUFFLE; + + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + return ncsi_process_next_channel(ndp); +} + +static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp, + struct ncsi_aen_pkt_hdr *h) +{ + struct ncsi_channel *nc; + unsigned long flags; + + /* Find the NCSI channel */ + ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc); + if (!nc) + return -ENODEV; + + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_ACTIVE) + return 0; + + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&ndp->lock, flags); + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + return ncsi_process_next_channel(ndp); +} + +static int ncsi_aen_handler_hncdsc(struct ncsi_dev_priv *ndp, + struct ncsi_aen_pkt_hdr *h) +{ + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + struct ncsi_aen_hncdsc_pkt *hncdsc; + unsigned long flags; + + /* Find the NCSI channel */ + ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc); + if (!nc) + return -ENODEV; + + /* If the channel is active one, we need reconfigure it */ + ncm = &nc->modes[NCSI_MODE_LINK]; + hncdsc = (struct ncsi_aen_hncdsc_pkt *)h; + ncm->data[3] = ntohl(hncdsc->status); + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_ACTIVE || + (ncm->data[3] & 0x1)) + return 0; + + if (ndp->flags & NCSI_DEV_HWA) + ndp->flags |= NCSI_DEV_RESHUFFLE; + + /* If this channel is the active one and the link doesn't + * work, we have to choose another channel to be active one. + * The logic here is exactly similar to what we do when link + * is down on the active channel. + */ + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + ncsi_process_next_channel(ndp); + + return 0; +} + +static struct ncsi_aen_handler { + unsigned char type; + int payload; + int (*handler)(struct ncsi_dev_priv *ndp, + struct ncsi_aen_pkt_hdr *h); +} ncsi_aen_handlers[] = { + { NCSI_PKT_AEN_LSC, 12, ncsi_aen_handler_lsc }, + { NCSI_PKT_AEN_CR, 4, ncsi_aen_handler_cr }, + { NCSI_PKT_AEN_HNCDSC, 4, ncsi_aen_handler_hncdsc } +}; + +int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb) +{ + struct ncsi_aen_pkt_hdr *h; + struct ncsi_aen_handler *nah = NULL; + int i, ret; + + /* Find the handler */ + h = (struct ncsi_aen_pkt_hdr *)skb_network_header(skb); + for (i = 0; i < ARRAY_SIZE(ncsi_aen_handlers); i++) { + if (ncsi_aen_handlers[i].type == h->type) { + nah = &ncsi_aen_handlers[i]; + break; + } + } + + if (!nah) { + netdev_warn(ndp->ndev.dev, "Invalid AEN (0x%x) received\n", + h->type); + return -ENOENT; + } + + ret = ncsi_validate_aen_pkt(h, nah->payload); + if (ret) + goto out; + + ret = nah->handler(ndp, h); +out: + consume_skb(skb); + return ret; +} diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c new file mode 100644 index 000000000..21057a8ce --- /dev/null +++ b/net/ncsi/ncsi-cmd.c @@ -0,0 +1,367 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "ncsi-pkt.h" + +u32 ncsi_calculate_checksum(unsigned char *data, int len) +{ + u32 checksum = 0; + int i; + + for (i = 0; i < len; i += 2) + checksum += (((u32)data[i] << 8) | data[i + 1]); + + checksum = (~checksum + 1); + return checksum; +} + +/* This function should be called after the data area has been + * populated completely. + */ +static void ncsi_cmd_build_header(struct ncsi_pkt_hdr *h, + struct ncsi_cmd_arg *nca) +{ + u32 checksum; + __be32 *pchecksum; + + h->mc_id = 0; + h->revision = NCSI_PKT_REVISION; + h->reserved = 0; + h->id = nca->id; + h->type = nca->type; + h->channel = NCSI_TO_CHANNEL(nca->package, + nca->channel); + h->length = htons(nca->payload); + h->reserved1[0] = 0; + h->reserved1[1] = 0; + + /* Fill with calculated checksum */ + checksum = ncsi_calculate_checksum((unsigned char *)h, + sizeof(*h) + nca->payload); + pchecksum = (__be32 *)((void *)h + sizeof(struct ncsi_pkt_hdr) + + nca->payload); + *pchecksum = htonl(checksum); +} + +static int ncsi_cmd_handler_default(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_pkt *cmd; + + cmd = (struct ncsi_cmd_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_sp(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_sp_pkt *cmd; + + cmd = (struct ncsi_cmd_sp_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->hw_arbitration = nca->bytes[0]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_dc(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_dc_pkt *cmd; + + cmd = (struct ncsi_cmd_dc_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->ald = nca->bytes[0]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_rc(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_rc_pkt *cmd; + + cmd = (struct ncsi_cmd_rc_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_ae(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_ae_pkt *cmd; + + cmd = (struct ncsi_cmd_ae_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mc_id = nca->bytes[0]; + cmd->mode = htonl(nca->dwords[1]); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_sl(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_sl_pkt *cmd; + + cmd = (struct ncsi_cmd_sl_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = htonl(nca->dwords[0]); + cmd->oem_mode = htonl(nca->dwords[1]); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_svf(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_svf_pkt *cmd; + + cmd = (struct ncsi_cmd_svf_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->vlan = htons(nca->words[0]); + cmd->index = nca->bytes[2]; + cmd->enable = nca->bytes[3]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_ev(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_ev_pkt *cmd; + + cmd = (struct ncsi_cmd_ev_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = nca->bytes[0]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_sma(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_sma_pkt *cmd; + int i; + + cmd = (struct ncsi_cmd_sma_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + for (i = 0; i < 6; i++) + cmd->mac[i] = nca->bytes[i]; + cmd->index = nca->bytes[6]; + cmd->at_e = nca->bytes[7]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_ebf(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_ebf_pkt *cmd; + + cmd = (struct ncsi_cmd_ebf_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = htonl(nca->dwords[0]); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_egmf(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_egmf_pkt *cmd; + + cmd = (struct ncsi_cmd_egmf_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = htonl(nca->dwords[0]); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_snfc(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_snfc_pkt *cmd; + + cmd = (struct ncsi_cmd_snfc_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = nca->bytes[0]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static struct ncsi_cmd_handler { + unsigned char type; + int payload; + int (*handler)(struct sk_buff *skb, + struct ncsi_cmd_arg *nca); +} ncsi_cmd_handlers[] = { + { NCSI_PKT_CMD_CIS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_SP, 4, ncsi_cmd_handler_sp }, + { NCSI_PKT_CMD_DP, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_EC, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_DC, 4, ncsi_cmd_handler_dc }, + { NCSI_PKT_CMD_RC, 4, ncsi_cmd_handler_rc }, + { NCSI_PKT_CMD_ECNT, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_DCNT, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_AE, 8, ncsi_cmd_handler_ae }, + { NCSI_PKT_CMD_SL, 8, ncsi_cmd_handler_sl }, + { NCSI_PKT_CMD_GLS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_SVF, 4, ncsi_cmd_handler_svf }, + { NCSI_PKT_CMD_EV, 4, ncsi_cmd_handler_ev }, + { NCSI_PKT_CMD_DV, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_SMA, 8, ncsi_cmd_handler_sma }, + { NCSI_PKT_CMD_EBF, 4, ncsi_cmd_handler_ebf }, + { NCSI_PKT_CMD_DBF, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_EGMF, 4, ncsi_cmd_handler_egmf }, + { NCSI_PKT_CMD_DGMF, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_SNFC, 4, ncsi_cmd_handler_snfc }, + { NCSI_PKT_CMD_GVI, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GC, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GP, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GCPS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GNS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GNPTS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GPS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_OEM, 0, NULL }, + { NCSI_PKT_CMD_PLDM, 0, NULL }, + { NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default } +}; + +static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca) +{ + struct ncsi_dev_priv *ndp = nca->ndp; + struct ncsi_dev *nd = &ndp->ndev; + struct net_device *dev = nd->dev; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; + int len = hlen + tlen; + struct sk_buff *skb; + struct ncsi_request *nr; + + nr = ncsi_alloc_request(ndp, nca->driven); + if (!nr) + return NULL; + + /* NCSI command packet has 16-bytes header, payload, 4 bytes checksum. + * The packet needs padding if its payload is less than 26 bytes to + * meet 64 bytes minimal ethernet frame length. + */ + len += sizeof(struct ncsi_cmd_pkt_hdr) + 4; + if (nca->payload < 26) + len += 26; + else + len += nca->payload; + + /* Allocate skb */ + skb = alloc_skb(len, GFP_ATOMIC); + if (!skb) { + ncsi_free_request(nr); + return NULL; + } + + nr->cmd = skb; + skb_reserve(skb, hlen); + skb_reset_network_header(skb); + + skb->dev = dev; + skb->protocol = htons(ETH_P_NCSI); + + return nr; +} + +int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) +{ + struct ncsi_request *nr; + struct ethhdr *eh; + struct ncsi_cmd_handler *nch = NULL; + int i, ret; + + /* Search for the handler */ + for (i = 0; i < ARRAY_SIZE(ncsi_cmd_handlers); i++) { + if (ncsi_cmd_handlers[i].type == nca->type) { + if (ncsi_cmd_handlers[i].handler) + nch = &ncsi_cmd_handlers[i]; + else + nch = NULL; + + break; + } + } + + if (!nch) { + netdev_err(nca->ndp->ndev.dev, + "Cannot send packet with type 0x%02x\n", nca->type); + return -ENOENT; + } + + /* Get packet payload length and allocate the request */ + nca->payload = nch->payload; + nr = ncsi_alloc_command(nca); + if (!nr) + return -ENOMEM; + + /* Prepare the packet */ + nca->id = nr->id; + ret = nch->handler(nr->cmd, nca); + if (ret) { + ncsi_free_request(nr); + return ret; + } + + /* Fill the ethernet header */ + eh = (struct ethhdr *)skb_push(nr->cmd, sizeof(*eh)); + eh->h_proto = htons(ETH_P_NCSI); + eth_broadcast_addr(eh->h_dest); + eth_broadcast_addr(eh->h_source); + + /* Start the timer for the request that might not have + * corresponding response. Given NCSI is an internal + * connection a 1 second delay should be sufficient. + */ + nr->enabled = true; + mod_timer(&nr->timer, jiffies + 1 * HZ); + + /* Send NCSI packet */ + skb_get(nr->cmd); + ret = dev_queue_xmit(nr->cmd); + if (ret < 0) { + ncsi_free_request(nr); + return ret; + } + + return 0; +} diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c new file mode 100644 index 000000000..ef017b871 --- /dev/null +++ b/net/ncsi/ncsi-manage.c @@ -0,0 +1,1205 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "ncsi-pkt.h" + +LIST_HEAD(ncsi_dev_list); +DEFINE_SPINLOCK(ncsi_dev_lock); + +static inline int ncsi_filter_size(int table) +{ + int sizes[] = { 2, 6, 6, 6 }; + + BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX); + if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX) + return -EINVAL; + + return sizes[table]; +} + +int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data) +{ + struct ncsi_channel_filter *ncf; + void *bitmap; + int index, size; + unsigned long flags; + + ncf = nc->filters[table]; + if (!ncf) + return -ENXIO; + + size = ncsi_filter_size(table); + if (size < 0) + return size; + + spin_lock_irqsave(&nc->lock, flags); + bitmap = (void *)&ncf->bitmap; + index = -1; + while ((index = find_next_bit(bitmap, ncf->total, index + 1)) + < ncf->total) { + if (!memcmp(ncf->data + size * index, data, size)) { + spin_unlock_irqrestore(&nc->lock, flags); + return index; + } + } + spin_unlock_irqrestore(&nc->lock, flags); + + return -ENOENT; +} + +int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data) +{ + struct ncsi_channel_filter *ncf; + int index, size; + void *bitmap; + unsigned long flags; + + size = ncsi_filter_size(table); + if (size < 0) + return size; + + index = ncsi_find_filter(nc, table, data); + if (index >= 0) + return index; + + ncf = nc->filters[table]; + if (!ncf) + return -ENODEV; + + spin_lock_irqsave(&nc->lock, flags); + bitmap = (void *)&ncf->bitmap; + do { + index = find_next_zero_bit(bitmap, ncf->total, 0); + if (index >= ncf->total) { + spin_unlock_irqrestore(&nc->lock, flags); + return -ENOSPC; + } + } while (test_and_set_bit(index, bitmap)); + + memcpy(ncf->data + size * index, data, size); + spin_unlock_irqrestore(&nc->lock, flags); + + return index; +} + +int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index) +{ + struct ncsi_channel_filter *ncf; + int size; + void *bitmap; + unsigned long flags; + + size = ncsi_filter_size(table); + if (size < 0) + return size; + + ncf = nc->filters[table]; + if (!ncf || index >= ncf->total) + return -ENODEV; + + spin_lock_irqsave(&nc->lock, flags); + bitmap = (void *)&ncf->bitmap; + if (test_and_clear_bit(index, bitmap)) + memset(ncf->data + size * index, 0, size); + spin_unlock_irqrestore(&nc->lock, flags); + + return 0; +} + +static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_package *np; + struct ncsi_channel *nc; + + nd->state = ncsi_dev_state_functional; + if (force_down) { + nd->link_up = 0; + goto report; + } + + nd->link_up = 0; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_ACTIVE) + continue; + + if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { + nd->link_up = 1; + goto report; + } + } + } + +report: + nd->handler(nd); +} + +static void ncsi_channel_monitor(unsigned long data) +{ + struct ncsi_channel *nc = (struct ncsi_channel *)data; + struct ncsi_package *np = nc->package; + struct ncsi_dev_priv *ndp = np->ndp; + struct ncsi_cmd_arg nca; + bool enabled; + unsigned int timeout; + unsigned long flags; + int ret; + + spin_lock_irqsave(&nc->lock, flags); + timeout = nc->timeout; + enabled = nc->enabled; + spin_unlock_irqrestore(&nc->lock, flags); + + if (!enabled || !list_empty(&nc->link)) + return; + if (nc->state != NCSI_CHANNEL_INACTIVE && + nc->state != NCSI_CHANNEL_ACTIVE) + return; + + if (!(timeout % 2)) { + nca.ndp = ndp; + nca.package = np->id; + nca.channel = nc->id; + nca.type = NCSI_PKT_CMD_GLS; + nca.driven = false; + ret = ncsi_xmit_cmd(&nca); + if (ret) { + netdev_err(ndp->ndev.dev, "Error %d sending GLS\n", + ret); + return; + } + } + + if (timeout + 1 >= 3) { + if (!(ndp->flags & NCSI_DEV_HWA) && + nc->state == NCSI_CHANNEL_ACTIVE) + ncsi_report_link(ndp, true); + + spin_lock_irqsave(&ndp->lock, flags); + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + ncsi_process_next_channel(ndp); + return; + } + + spin_lock_irqsave(&nc->lock, flags); + nc->timeout = timeout + 1; + nc->enabled = true; + spin_unlock_irqrestore(&nc->lock, flags); + mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); +} + +void ncsi_start_channel_monitor(struct ncsi_channel *nc) +{ + unsigned long flags; + + spin_lock_irqsave(&nc->lock, flags); + WARN_ON_ONCE(nc->enabled); + nc->timeout = 0; + nc->enabled = true; + spin_unlock_irqrestore(&nc->lock, flags); + + mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); +} + +void ncsi_stop_channel_monitor(struct ncsi_channel *nc) +{ + unsigned long flags; + + spin_lock_irqsave(&nc->lock, flags); + if (!nc->enabled) { + spin_unlock_irqrestore(&nc->lock, flags); + return; + } + nc->enabled = false; + spin_unlock_irqrestore(&nc->lock, flags); + + del_timer_sync(&nc->timer); +} + +struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, + unsigned char id) +{ + struct ncsi_channel *nc; + + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (nc->id == id) + return nc; + } + + return NULL; +} + +struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) +{ + struct ncsi_channel *nc, *tmp; + int index; + unsigned long flags; + + nc = kzalloc(sizeof(*nc), GFP_ATOMIC); + if (!nc) + return NULL; + + nc->id = id; + nc->package = np; + nc->state = NCSI_CHANNEL_INACTIVE; + nc->enabled = false; + setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc); + spin_lock_init(&nc->lock); + INIT_LIST_HEAD(&nc->link); + for (index = 0; index < NCSI_CAP_MAX; index++) + nc->caps[index].index = index; + for (index = 0; index < NCSI_MODE_MAX; index++) + nc->modes[index].index = index; + + spin_lock_irqsave(&np->lock, flags); + tmp = ncsi_find_channel(np, id); + if (tmp) { + spin_unlock_irqrestore(&np->lock, flags); + kfree(nc); + return tmp; + } + + list_add_tail_rcu(&nc->node, &np->channels); + np->channel_num++; + spin_unlock_irqrestore(&np->lock, flags); + + return nc; +} + +static void ncsi_remove_channel(struct ncsi_channel *nc) +{ + struct ncsi_package *np = nc->package; + struct ncsi_channel_filter *ncf; + unsigned long flags; + int i; + + /* Release filters */ + spin_lock_irqsave(&nc->lock, flags); + for (i = 0; i < NCSI_FILTER_MAX; i++) { + ncf = nc->filters[i]; + if (!ncf) + continue; + + nc->filters[i] = NULL; + kfree(ncf); + } + + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + ncsi_stop_channel_monitor(nc); + + /* Remove and free channel */ + spin_lock_irqsave(&np->lock, flags); + list_del_rcu(&nc->node); + np->channel_num--; + spin_unlock_irqrestore(&np->lock, flags); + + kfree(nc); +} + +struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp, + unsigned char id) +{ + struct ncsi_package *np; + + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (np->id == id) + return np; + } + + return NULL; +} + +struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, + unsigned char id) +{ + struct ncsi_package *np, *tmp; + unsigned long flags; + + np = kzalloc(sizeof(*np), GFP_ATOMIC); + if (!np) + return NULL; + + np->id = id; + np->ndp = ndp; + spin_lock_init(&np->lock); + INIT_LIST_HEAD(&np->channels); + + spin_lock_irqsave(&ndp->lock, flags); + tmp = ncsi_find_package(ndp, id); + if (tmp) { + spin_unlock_irqrestore(&ndp->lock, flags); + kfree(np); + return tmp; + } + + list_add_tail_rcu(&np->node, &ndp->packages); + ndp->package_num++; + spin_unlock_irqrestore(&ndp->lock, flags); + + return np; +} + +void ncsi_remove_package(struct ncsi_package *np) +{ + struct ncsi_dev_priv *ndp = np->ndp; + struct ncsi_channel *nc, *tmp; + unsigned long flags; + + /* Release all child channels */ + list_for_each_entry_safe(nc, tmp, &np->channels, node) + ncsi_remove_channel(nc); + + /* Remove and free package */ + spin_lock_irqsave(&ndp->lock, flags); + list_del_rcu(&np->node); + ndp->package_num--; + spin_unlock_irqrestore(&ndp->lock, flags); + + kfree(np); +} + +void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, + unsigned char id, + struct ncsi_package **np, + struct ncsi_channel **nc) +{ + struct ncsi_package *p; + struct ncsi_channel *c; + + p = ncsi_find_package(ndp, NCSI_PACKAGE_INDEX(id)); + c = p ? ncsi_find_channel(p, NCSI_CHANNEL_INDEX(id)) : NULL; + + if (np) + *np = p; + if (nc) + *nc = c; +} + +/* For two consecutive NCSI commands, the packet IDs shouldn't + * be same. Otherwise, the bogus response might be replied. So + * the available IDs are allocated in round-robin fashion. + */ +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) +{ + struct ncsi_request *nr = NULL; + int i, limit = ARRAY_SIZE(ndp->requests); + unsigned long flags; + + /* Check if there is one available request until the ceiling */ + spin_lock_irqsave(&ndp->lock, flags); + for (i = ndp->request_id; !nr && i < limit; i++) { + if (ndp->requests[i].used) + continue; + + nr = &ndp->requests[i]; + nr->used = true; + nr->driven = driven; + if (++ndp->request_id >= limit) + ndp->request_id = 0; + } + + /* Fail back to check from the starting cursor */ + for (i = 0; !nr && i < ndp->request_id; i++) { + if (ndp->requests[i].used) + continue; + + nr = &ndp->requests[i]; + nr->used = true; + nr->driven = driven; + if (++ndp->request_id >= limit) + ndp->request_id = 0; + } + spin_unlock_irqrestore(&ndp->lock, flags); + + return nr; +} + +void ncsi_free_request(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct sk_buff *cmd, *rsp; + unsigned long flags; + bool driven; + + if (nr->enabled) { + nr->enabled = false; + del_timer_sync(&nr->timer); + } + + spin_lock_irqsave(&ndp->lock, flags); + cmd = nr->cmd; + rsp = nr->rsp; + nr->cmd = NULL; + nr->rsp = NULL; + nr->used = false; + driven = nr->driven; + spin_unlock_irqrestore(&ndp->lock, flags); + + if (driven && cmd && --ndp->pending_req_num == 0) + schedule_work(&ndp->work); + + /* Release command and response */ + consume_skb(cmd); + consume_skb(rsp); +} + +struct ncsi_dev *ncsi_find_dev(struct net_device *dev) +{ + struct ncsi_dev_priv *ndp; + + NCSI_FOR_EACH_DEV(ndp) { + if (ndp->ndev.dev == dev) + return &ndp->ndev; + } + + return NULL; +} + +static void ncsi_request_timeout(unsigned long data) +{ + struct ncsi_request *nr = (struct ncsi_request *)data; + struct ncsi_dev_priv *ndp = nr->ndp; + unsigned long flags; + + /* If the request already had associated response, + * let the response handler to release it. + */ + spin_lock_irqsave(&ndp->lock, flags); + nr->enabled = false; + if (nr->rsp || !nr->cmd) { + spin_unlock_irqrestore(&ndp->lock, flags); + return; + } + spin_unlock_irqrestore(&ndp->lock, flags); + + /* Release the request */ + ncsi_free_request(nr); +} + +static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_package *np = ndp->active_package; + struct ncsi_channel *nc = ndp->active_channel; + struct ncsi_cmd_arg nca; + int ret; + + nca.ndp = ndp; + nca.driven = true; + switch (nd->state) { + case ncsi_dev_state_suspend: + nd->state = ncsi_dev_state_suspend_select; + /* Fall through */ + case ncsi_dev_state_suspend_select: + case ncsi_dev_state_suspend_dcnt: + case ncsi_dev_state_suspend_dc: + case ncsi_dev_state_suspend_deselect: + ndp->pending_req_num = 1; + + np = ndp->active_package; + nc = ndp->active_channel; + nca.package = np->id; + if (nd->state == ncsi_dev_state_suspend_select) { + nca.type = NCSI_PKT_CMD_SP; + nca.channel = 0x1f; + if (ndp->flags & NCSI_DEV_HWA) + nca.bytes[0] = 0; + else + nca.bytes[0] = 1; + nd->state = ncsi_dev_state_suspend_dcnt; + } else if (nd->state == ncsi_dev_state_suspend_dcnt) { + nca.type = NCSI_PKT_CMD_DCNT; + nca.channel = nc->id; + nd->state = ncsi_dev_state_suspend_dc; + } else if (nd->state == ncsi_dev_state_suspend_dc) { + nca.type = NCSI_PKT_CMD_DC; + nca.channel = nc->id; + nca.bytes[0] = 1; + nd->state = ncsi_dev_state_suspend_deselect; + } else if (nd->state == ncsi_dev_state_suspend_deselect) { + nca.type = NCSI_PKT_CMD_DP; + nca.channel = 0x1f; + nd->state = ncsi_dev_state_suspend_done; + } + + ret = ncsi_xmit_cmd(&nca); + if (ret) { + nd->state = ncsi_dev_state_functional; + return; + } + + break; + case ncsi_dev_state_suspend_done: + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + ncsi_process_next_channel(ndp); + + break; + default: + netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n", + nd->state); + } +} + +static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct net_device *dev = nd->dev; + struct ncsi_package *np = ndp->active_package; + struct ncsi_channel *nc = ndp->active_channel; + struct ncsi_cmd_arg nca; + unsigned char index; + int ret; + + nca.ndp = ndp; + nca.driven = true; + switch (nd->state) { + case ncsi_dev_state_config: + case ncsi_dev_state_config_sp: + ndp->pending_req_num = 1; + + /* Select the specific package */ + nca.type = NCSI_PKT_CMD_SP; + if (ndp->flags & NCSI_DEV_HWA) + nca.bytes[0] = 0; + else + nca.bytes[0] = 1; + nca.package = np->id; + nca.channel = 0x1f; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_config_cis; + break; + case ncsi_dev_state_config_cis: + ndp->pending_req_num = 1; + + /* Clear initial state */ + nca.type = NCSI_PKT_CMD_CIS; + nca.package = np->id; + nca.channel = nc->id; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_config_sma; + break; + case ncsi_dev_state_config_sma: + case ncsi_dev_state_config_ebf: +#if IS_ENABLED(CONFIG_IPV6) + case ncsi_dev_state_config_egmf: +#endif + case ncsi_dev_state_config_ecnt: + case ncsi_dev_state_config_ec: + case ncsi_dev_state_config_ae: + case ncsi_dev_state_config_gls: + ndp->pending_req_num = 1; + + nca.package = np->id; + nca.channel = nc->id; + + /* Use first entry in unicast filter table. Note that + * the MAC filter table starts from entry 1 instead of + * 0. + */ + if (nd->state == ncsi_dev_state_config_sma) { + nca.type = NCSI_PKT_CMD_SMA; + for (index = 0; index < 6; index++) + nca.bytes[index] = dev->dev_addr[index]; + nca.bytes[6] = 0x1; + nca.bytes[7] = 0x1; + nd->state = ncsi_dev_state_config_ebf; + } else if (nd->state == ncsi_dev_state_config_ebf) { + nca.type = NCSI_PKT_CMD_EBF; + nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; + nd->state = ncsi_dev_state_config_ecnt; +#if IS_ENABLED(CONFIG_IPV6) + if (ndp->inet6_addr_num > 0 && + (nc->caps[NCSI_CAP_GENERIC].cap & + NCSI_CAP_GENERIC_MC)) + nd->state = ncsi_dev_state_config_egmf; + else + nd->state = ncsi_dev_state_config_ecnt; + } else if (nd->state == ncsi_dev_state_config_egmf) { + nca.type = NCSI_PKT_CMD_EGMF; + nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; + nd->state = ncsi_dev_state_config_ecnt; +#endif /* CONFIG_IPV6 */ + } else if (nd->state == ncsi_dev_state_config_ecnt) { + nca.type = NCSI_PKT_CMD_ECNT; + nd->state = ncsi_dev_state_config_ec; + } else if (nd->state == ncsi_dev_state_config_ec) { + /* Enable AEN if it's supported */ + nca.type = NCSI_PKT_CMD_EC; + nd->state = ncsi_dev_state_config_ae; + if (!(nc->caps[NCSI_CAP_AEN].cap & NCSI_CAP_AEN_MASK)) + nd->state = ncsi_dev_state_config_gls; + } else if (nd->state == ncsi_dev_state_config_ae) { + nca.type = NCSI_PKT_CMD_AE; + nca.bytes[0] = 0; + nca.dwords[1] = nc->caps[NCSI_CAP_AEN].cap; + nd->state = ncsi_dev_state_config_gls; + } else if (nd->state == ncsi_dev_state_config_gls) { + nca.type = NCSI_PKT_CMD_GLS; + nd->state = ncsi_dev_state_config_done; + } + + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + break; + case ncsi_dev_state_config_done: + if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) + xchg(&nc->state, NCSI_CHANNEL_ACTIVE); + else + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + + ncsi_start_channel_monitor(nc); + ncsi_process_next_channel(ndp); + break; + default: + netdev_warn(dev, "Wrong NCSI state 0x%x in config\n", + nd->state); + } + + return; + +error: + ncsi_report_link(ndp, true); +} + +static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_package *np; + struct ncsi_channel *nc, *found; + struct ncsi_channel_mode *ncm; + unsigned long flags; + + /* The search is done once an inactive channel with up + * link is found. + */ + found = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_INACTIVE) + continue; + + if (!found) + found = nc; + + ncm = &nc->modes[NCSI_MODE_LINK]; + if (ncm->data[2] & 0x1) { + found = nc; + goto out; + } + } + } + + if (!found) { + ncsi_report_link(ndp, true); + return -ENODEV; + } + +out: + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&found->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + return ncsi_process_next_channel(ndp); +} + +static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp) +{ + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned int cap; + + /* The hardware arbitration is disabled if any one channel + * doesn't support explicitly. + */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + cap = nc->caps[NCSI_CAP_GENERIC].cap; + if (!(cap & NCSI_CAP_GENERIC_HWA) || + (cap & NCSI_CAP_GENERIC_HWA_MASK) != + NCSI_CAP_GENERIC_HWA_SUPPORT) { + ndp->flags &= ~NCSI_DEV_HWA; + return false; + } + } + } + + ndp->flags |= NCSI_DEV_HWA; + return true; +} + +static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp) +{ + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned long flags; + + /* Move all available channels to processing queue */ + spin_lock_irqsave(&ndp->lock, flags); + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE || + !list_empty(&nc->link)); + ncsi_stop_channel_monitor(nc); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + } + } + spin_unlock_irqrestore(&ndp->lock, flags); + + /* We can have no channels in extremely case */ + if (list_empty(&ndp->channel_queue)) { + ncsi_report_link(ndp, false); + return -ENOENT; + } + + return ncsi_process_next_channel(ndp); +} + +static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_package *np; + struct ncsi_channel *nc; + struct ncsi_cmd_arg nca; + unsigned char index; + int ret; + + nca.ndp = ndp; + nca.driven = true; + switch (nd->state) { + case ncsi_dev_state_probe: + nd->state = ncsi_dev_state_probe_deselect; + /* Fall through */ + case ncsi_dev_state_probe_deselect: + ndp->pending_req_num = 8; + + /* Deselect all possible packages */ + nca.type = NCSI_PKT_CMD_DP; + nca.channel = 0x1f; + for (index = 0; index < 8; index++) { + nca.package = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + nd->state = ncsi_dev_state_probe_package; + break; + case ncsi_dev_state_probe_package: + ndp->pending_req_num = 16; + + /* Select all possible packages */ + nca.type = NCSI_PKT_CMD_SP; + nca.bytes[0] = 1; + nca.channel = 0x1f; + for (index = 0; index < 8; index++) { + nca.package = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + /* Disable all possible packages */ + nca.type = NCSI_PKT_CMD_DP; + for (index = 0; index < 8; index++) { + nca.package = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + nd->state = ncsi_dev_state_probe_channel; + break; + case ncsi_dev_state_probe_channel: + if (!ndp->active_package) + ndp->active_package = list_first_or_null_rcu( + &ndp->packages, struct ncsi_package, node); + else if (list_is_last(&ndp->active_package->node, + &ndp->packages)) + ndp->active_package = NULL; + else + ndp->active_package = list_next_entry( + ndp->active_package, node); + + /* All available packages and channels are enumerated. The + * enumeration happens for once when the NCSI interface is + * started. So we need continue to start the interface after + * the enumeration. + * + * We have to choose an active channel before configuring it. + * Note that we possibly don't have active channel in extreme + * situation. + */ + if (!ndp->active_package) { + ndp->flags |= NCSI_DEV_PROBED; + if (ncsi_check_hwa(ndp)) + ncsi_enable_hwa(ndp); + else + ncsi_choose_active_channel(ndp); + return; + } + + /* Select the active package */ + ndp->pending_req_num = 1; + nca.type = NCSI_PKT_CMD_SP; + nca.bytes[0] = 1; + nca.package = ndp->active_package->id; + nca.channel = 0x1f; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_cis; + break; + case ncsi_dev_state_probe_cis: + ndp->pending_req_num = 32; + + /* Clear initial state */ + nca.type = NCSI_PKT_CMD_CIS; + nca.package = ndp->active_package->id; + for (index = 0; index < 0x20; index++) { + nca.channel = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + nd->state = ncsi_dev_state_probe_gvi; + break; + case ncsi_dev_state_probe_gvi: + case ncsi_dev_state_probe_gc: + case ncsi_dev_state_probe_gls: + np = ndp->active_package; + ndp->pending_req_num = np->channel_num; + + /* Retrieve version, capability or link status */ + if (nd->state == ncsi_dev_state_probe_gvi) + nca.type = NCSI_PKT_CMD_GVI; + else if (nd->state == ncsi_dev_state_probe_gc) + nca.type = NCSI_PKT_CMD_GC; + else + nca.type = NCSI_PKT_CMD_GLS; + + nca.package = np->id; + NCSI_FOR_EACH_CHANNEL(np, nc) { + nca.channel = nc->id; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + if (nd->state == ncsi_dev_state_probe_gvi) + nd->state = ncsi_dev_state_probe_gc; + else if (nd->state == ncsi_dev_state_probe_gc) + nd->state = ncsi_dev_state_probe_gls; + else + nd->state = ncsi_dev_state_probe_dp; + break; + case ncsi_dev_state_probe_dp: + ndp->pending_req_num = 1; + + /* Deselect the active package */ + nca.type = NCSI_PKT_CMD_DP; + nca.package = ndp->active_package->id; + nca.channel = 0x1f; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + /* Scan channels in next package */ + nd->state = ncsi_dev_state_probe_channel; + break; + default: + netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n", + nd->state); + } + + return; +error: + ncsi_report_link(ndp, true); +} + +static void ncsi_dev_work(struct work_struct *work) +{ + struct ncsi_dev_priv *ndp = container_of(work, + struct ncsi_dev_priv, work); + struct ncsi_dev *nd = &ndp->ndev; + + switch (nd->state & ncsi_dev_state_major) { + case ncsi_dev_state_probe: + ncsi_probe_channel(ndp); + break; + case ncsi_dev_state_suspend: + ncsi_suspend_channel(ndp); + break; + case ncsi_dev_state_config: + ncsi_configure_channel(ndp); + break; + default: + netdev_warn(nd->dev, "Wrong NCSI state 0x%x in workqueue\n", + nd->state); + } +} + +int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_channel *nc; + int old_state; + unsigned long flags; + + spin_lock_irqsave(&ndp->lock, flags); + nc = list_first_or_null_rcu(&ndp->channel_queue, + struct ncsi_channel, link); + if (!nc) { + spin_unlock_irqrestore(&ndp->lock, flags); + goto out; + } + + old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE); + list_del_init(&nc->link); + + spin_unlock_irqrestore(&ndp->lock, flags); + + ndp->active_channel = nc; + ndp->active_package = nc->package; + + switch (old_state) { + case NCSI_CHANNEL_INACTIVE: + ndp->ndev.state = ncsi_dev_state_config; + ncsi_configure_channel(ndp); + break; + case NCSI_CHANNEL_ACTIVE: + ndp->ndev.state = ncsi_dev_state_suspend; + ncsi_suspend_channel(ndp); + break; + default: + netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n", + nc->state, nc->package->id, nc->id); + ncsi_report_link(ndp, false); + return -EINVAL; + } + + return 0; + +out: + ndp->active_channel = NULL; + ndp->active_package = NULL; + if (ndp->flags & NCSI_DEV_RESHUFFLE) { + ndp->flags &= ~NCSI_DEV_RESHUFFLE; + return ncsi_choose_active_channel(ndp); + } + + ncsi_report_link(ndp, false); + return -ENODEV; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int ncsi_inet6addr_event(struct notifier_block *this, + unsigned long event, void *data) +{ + struct inet6_ifaddr *ifa = data; + struct net_device *dev = ifa->idev->dev; + struct ncsi_dev *nd = ncsi_find_dev(dev); + struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; + struct ncsi_package *np; + struct ncsi_channel *nc; + struct ncsi_cmd_arg nca; + bool action; + int ret; + + if (!ndp || (ipv6_addr_type(&ifa->addr) & + (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK))) + return NOTIFY_OK; + + switch (event) { + case NETDEV_UP: + action = (++ndp->inet6_addr_num) == 1; + nca.type = NCSI_PKT_CMD_EGMF; + break; + case NETDEV_DOWN: + action = (--ndp->inet6_addr_num == 0); + nca.type = NCSI_PKT_CMD_DGMF; + break; + default: + return NOTIFY_OK; + } + + /* We might not have active channel or packages. The IPv6 + * required multicast will be enabled when active channel + * or packages are chosen. + */ + np = ndp->active_package; + nc = ndp->active_channel; + if (!action || !np || !nc) + return NOTIFY_OK; + + /* We needn't enable or disable it if the function isn't supported */ + if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) + return NOTIFY_OK; + + nca.ndp = ndp; + nca.driven = false; + nca.package = np->id; + nca.channel = nc->id; + nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; + ret = ncsi_xmit_cmd(&nca); + if (ret) { + netdev_warn(dev, "Fail to %s global multicast filter (%d)\n", + (event == NETDEV_UP) ? "enable" : "disable", ret); + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static struct notifier_block ncsi_inet6addr_notifier = { + .notifier_call = ncsi_inet6addr_event, +}; +#endif /* CONFIG_IPV6 */ + +struct ncsi_dev *ncsi_register_dev(struct net_device *dev, + void (*handler)(struct ncsi_dev *ndev)) +{ + struct ncsi_dev_priv *ndp; + struct ncsi_dev *nd; + unsigned long flags; + int i; + + /* Check if the device has been registered or not */ + nd = ncsi_find_dev(dev); + if (nd) + return nd; + + /* Create NCSI device */ + ndp = kzalloc(sizeof(*ndp), GFP_ATOMIC); + if (!ndp) + return NULL; + + nd = &ndp->ndev; + nd->state = ncsi_dev_state_registered; + nd->dev = dev; + nd->handler = handler; + ndp->pending_req_num = 0; + INIT_LIST_HEAD(&ndp->channel_queue); + INIT_WORK(&ndp->work, ncsi_dev_work); + + /* Initialize private NCSI device */ + spin_lock_init(&ndp->lock); + INIT_LIST_HEAD(&ndp->packages); + ndp->request_id = 0; + for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) { + ndp->requests[i].id = i; + ndp->requests[i].ndp = ndp; + setup_timer(&ndp->requests[i].timer, + ncsi_request_timeout, + (unsigned long)&ndp->requests[i]); + } + + spin_lock_irqsave(&ncsi_dev_lock, flags); +#if IS_ENABLED(CONFIG_IPV6) + ndp->inet6_addr_num = 0; + if (list_empty(&ncsi_dev_list)) + register_inet6addr_notifier(&ncsi_inet6addr_notifier); +#endif + list_add_tail_rcu(&ndp->node, &ncsi_dev_list); + spin_unlock_irqrestore(&ncsi_dev_lock, flags); + + /* Register NCSI packet Rx handler */ + ndp->ptype.type = cpu_to_be16(ETH_P_NCSI); + ndp->ptype.func = ncsi_rcv_rsp; + ndp->ptype.dev = dev; + dev_add_pack(&ndp->ptype); + + return nd; +} +EXPORT_SYMBOL_GPL(ncsi_register_dev); + +int ncsi_start_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_package *np; + struct ncsi_channel *nc; + int old_state, ret; + + if (nd->state != ncsi_dev_state_registered && + nd->state != ncsi_dev_state_functional) + return -ENOTTY; + + if (!(ndp->flags & NCSI_DEV_PROBED)) { + nd->state = ncsi_dev_state_probe; + schedule_work(&ndp->work); + return 0; + } + + /* Reset channel's state and start over */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + WARN_ON_ONCE(!list_empty(&nc->link) || + old_state == NCSI_CHANNEL_INVISIBLE); + } + } + + if (ndp->flags & NCSI_DEV_HWA) + ret = ncsi_enable_hwa(ndp); + else + ret = ncsi_choose_active_channel(ndp); + + return ret; +} +EXPORT_SYMBOL_GPL(ncsi_start_dev); + +void ncsi_unregister_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_package *np, *tmp; + unsigned long flags; + + dev_remove_pack(&ndp->ptype); + + list_for_each_entry_safe(np, tmp, &ndp->packages, node) + ncsi_remove_package(np); + + spin_lock_irqsave(&ncsi_dev_lock, flags); + list_del_rcu(&ndp->node); +#if IS_ENABLED(CONFIG_IPV6) + if (list_empty(&ncsi_dev_list)) + unregister_inet6addr_notifier(&ncsi_inet6addr_notifier); +#endif + spin_unlock_irqrestore(&ncsi_dev_lock, flags); + + kfree(ndp); +} +EXPORT_SYMBOL_GPL(ncsi_unregister_dev); diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h new file mode 100644 index 000000000..3ea49ed0a --- /dev/null +++ b/net/ncsi/ncsi-pkt.h @@ -0,0 +1,415 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NCSI_PKT_H__ +#define __NCSI_PKT_H__ + +struct ncsi_pkt_hdr { + unsigned char mc_id; /* Management controller ID */ + unsigned char revision; /* NCSI version - 0x01 */ + unsigned char reserved; /* Reserved */ + unsigned char id; /* Packet sequence number */ + unsigned char type; /* Packet type */ + unsigned char channel; /* Network controller ID */ + __be16 length; /* Payload length */ + __be32 reserved1[2]; /* Reserved */ +}; + +struct ncsi_cmd_pkt_hdr { + struct ncsi_pkt_hdr common; /* Common NCSI packet header */ +}; + +struct ncsi_rsp_pkt_hdr { + struct ncsi_pkt_hdr common; /* Common NCSI packet header */ + __be16 code; /* Response code */ + __be16 reason; /* Response reason */ +}; + +struct ncsi_aen_pkt_hdr { + struct ncsi_pkt_hdr common; /* Common NCSI packet header */ + unsigned char reserved2[3]; /* Reserved */ + unsigned char type; /* AEN packet type */ +}; + +/* NCSI common command packet */ +struct ncsi_cmd_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 checksum; /* Checksum */ + unsigned char pad[26]; +}; + +struct ncsi_rsp_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Select Package */ +struct ncsi_cmd_sp_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char hw_arbitration; /* HW arbitration */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Disable Channel */ +struct ncsi_cmd_dc_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char ald; /* Allow link down */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Reset Channel */ +struct ncsi_cmd_rc_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 reserved; /* Reserved */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* AEN Enable */ +struct ncsi_cmd_ae_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char mc_id; /* MC ID */ + __be32 mode; /* AEN working mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[18]; +}; + +/* Set Link */ +struct ncsi_cmd_sl_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 mode; /* Link working mode */ + __be32 oem_mode; /* OEM link mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[18]; +}; + +/* Set VLAN Filter */ +struct ncsi_cmd_svf_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be16 reserved; /* Reserved */ + __be16 vlan; /* VLAN ID */ + __be16 reserved1; /* Reserved */ + unsigned char index; /* VLAN table index */ + unsigned char enable; /* Enable or disable */ + __be32 checksum; /* Checksum */ + unsigned char pad[14]; +}; + +/* Enable VLAN */ +struct ncsi_cmd_ev_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char mode; /* VLAN filter mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Set MAC Address */ +struct ncsi_cmd_sma_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char mac[6]; /* MAC address */ + unsigned char index; /* MAC table index */ + unsigned char at_e; /* Addr type and operation */ + __be32 checksum; /* Checksum */ + unsigned char pad[18]; +}; + +/* Enable Broadcast Filter */ +struct ncsi_cmd_ebf_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 mode; /* Filter mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Enable Global Multicast Filter */ +struct ncsi_cmd_egmf_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 mode; /* Global MC mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Set NCSI Flow Control */ +struct ncsi_cmd_snfc_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char mode; /* Flow control mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Get Link Status */ +struct ncsi_rsp_gls_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 status; /* Link status */ + __be32 other; /* Other indications */ + __be32 oem_status; /* OEM link status */ + __be32 checksum; + unsigned char pad[10]; +}; + +/* Get Version ID */ +struct ncsi_rsp_gvi_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 ncsi_version; /* NCSI version */ + unsigned char reserved[3]; /* Reserved */ + unsigned char alpha2; /* NCSI version */ + unsigned char fw_name[12]; /* f/w name string */ + __be32 fw_version; /* f/w version */ + __be16 pci_ids[4]; /* PCI IDs */ + __be32 mf_id; /* Manufacture ID */ + __be32 checksum; +}; + +/* Get Capabilities */ +struct ncsi_rsp_gc_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 cap; /* Capabilities */ + __be32 bc_cap; /* Broadcast cap */ + __be32 mc_cap; /* Multicast cap */ + __be32 buf_cap; /* Buffering cap */ + __be32 aen_cap; /* AEN cap */ + unsigned char vlan_cnt; /* VLAN filter count */ + unsigned char mixed_cnt; /* Mix filter count */ + unsigned char mc_cnt; /* MC filter count */ + unsigned char uc_cnt; /* UC filter count */ + unsigned char reserved[2]; /* Reserved */ + unsigned char vlan_mode; /* VLAN mode */ + unsigned char channel_cnt; /* Channel count */ + __be32 checksum; /* Checksum */ +}; + +/* Get Parameters */ +struct ncsi_rsp_gp_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + unsigned char mac_cnt; /* Number of MAC addr */ + unsigned char reserved[2]; /* Reserved */ + unsigned char mac_enable; /* MAC addr enable flags */ + unsigned char vlan_cnt; /* VLAN tag count */ + unsigned char reserved1; /* Reserved */ + __be16 vlan_enable; /* VLAN tag enable flags */ + __be32 link_mode; /* Link setting */ + __be32 bc_mode; /* BC filter mode */ + __be32 valid_modes; /* Valid mode parameters */ + unsigned char vlan_mode; /* VLAN mode */ + unsigned char fc_mode; /* Flow control mode */ + unsigned char reserved2[2]; /* Reserved */ + __be32 aen_mode; /* AEN mode */ + unsigned char mac[6]; /* Supported MAC addr */ + __be16 vlan; /* Supported VLAN tags */ + __be32 checksum; /* Checksum */ +}; + +/* Get Controller Packet Statistics */ +struct ncsi_rsp_gcps_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 cnt_hi; /* Counter cleared */ + __be32 cnt_lo; /* Counter cleared */ + __be32 rx_bytes; /* Rx bytes */ + __be32 tx_bytes; /* Tx bytes */ + __be32 rx_uc_pkts; /* Rx UC packets */ + __be32 rx_mc_pkts; /* Rx MC packets */ + __be32 rx_bc_pkts; /* Rx BC packets */ + __be32 tx_uc_pkts; /* Tx UC packets */ + __be32 tx_mc_pkts; /* Tx MC packets */ + __be32 tx_bc_pkts; /* Tx BC packets */ + __be32 fcs_err; /* FCS errors */ + __be32 align_err; /* Alignment errors */ + __be32 false_carrier; /* False carrier detection */ + __be32 runt_pkts; /* Rx runt packets */ + __be32 jabber_pkts; /* Rx jabber packets */ + __be32 rx_pause_xon; /* Rx pause XON frames */ + __be32 rx_pause_xoff; /* Rx XOFF frames */ + __be32 tx_pause_xon; /* Tx XON frames */ + __be32 tx_pause_xoff; /* Tx XOFF frames */ + __be32 tx_s_collision; /* Single collision frames */ + __be32 tx_m_collision; /* Multiple collision frames */ + __be32 l_collision; /* Late collision frames */ + __be32 e_collision; /* Excessive collision frames */ + __be32 rx_ctl_frames; /* Rx control frames */ + __be32 rx_64_frames; /* Rx 64-bytes frames */ + __be32 rx_127_frames; /* Rx 65-127 bytes frames */ + __be32 rx_255_frames; /* Rx 128-255 bytes frames */ + __be32 rx_511_frames; /* Rx 256-511 bytes frames */ + __be32 rx_1023_frames; /* Rx 512-1023 bytes frames */ + __be32 rx_1522_frames; /* Rx 1024-1522 bytes frames */ + __be32 rx_9022_frames; /* Rx 1523-9022 bytes frames */ + __be32 tx_64_frames; /* Tx 64-bytes frames */ + __be32 tx_127_frames; /* Tx 65-127 bytes frames */ + __be32 tx_255_frames; /* Tx 128-255 bytes frames */ + __be32 tx_511_frames; /* Tx 256-511 bytes frames */ + __be32 tx_1023_frames; /* Tx 512-1023 bytes frames */ + __be32 tx_1522_frames; /* Tx 1024-1522 bytes frames */ + __be32 tx_9022_frames; /* Tx 1523-9022 bytes frames */ + __be32 rx_valid_bytes; /* Rx valid bytes */ + __be32 rx_runt_pkts; /* Rx error runt packets */ + __be32 rx_jabber_pkts; /* Rx error jabber packets */ + __be32 checksum; /* Checksum */ +}; + +/* Get NCSI Statistics */ +struct ncsi_rsp_gns_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 rx_cmds; /* Rx NCSI commands */ + __be32 dropped_cmds; /* Dropped commands */ + __be32 cmd_type_errs; /* Command type errors */ + __be32 cmd_csum_errs; /* Command checksum errors */ + __be32 rx_pkts; /* Rx NCSI packets */ + __be32 tx_pkts; /* Tx NCSI packets */ + __be32 tx_aen_pkts; /* Tx AEN packets */ + __be32 checksum; /* Checksum */ +}; + +/* Get NCSI Pass-through Statistics */ +struct ncsi_rsp_gnpts_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 tx_pkts; /* Tx packets */ + __be32 tx_dropped; /* Tx dropped packets */ + __be32 tx_channel_err; /* Tx channel errors */ + __be32 tx_us_err; /* Tx undersize errors */ + __be32 rx_pkts; /* Rx packets */ + __be32 rx_dropped; /* Rx dropped packets */ + __be32 rx_channel_err; /* Rx channel errors */ + __be32 rx_us_err; /* Rx undersize errors */ + __be32 rx_os_err; /* Rx oversize errors */ + __be32 checksum; /* Checksum */ +}; + +/* Get package status */ +struct ncsi_rsp_gps_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 status; /* Hardware arbitration status */ + __be32 checksum; +}; + +/* Get package UUID */ +struct ncsi_rsp_gpuuid_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + unsigned char uuid[16]; /* UUID */ + __be32 checksum; +}; + +/* AEN: Link State Change */ +struct ncsi_aen_lsc_pkt { + struct ncsi_aen_pkt_hdr aen; /* AEN header */ + __be32 status; /* Link status */ + __be32 oem_status; /* OEM link status */ + __be32 checksum; /* Checksum */ + unsigned char pad[14]; +}; + +/* AEN: Configuration Required */ +struct ncsi_aen_cr_pkt { + struct ncsi_aen_pkt_hdr aen; /* AEN header */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* AEN: Host Network Controller Driver Status Change */ +struct ncsi_aen_hncdsc_pkt { + struct ncsi_aen_pkt_hdr aen; /* AEN header */ + __be32 status; /* Status */ + __be32 checksum; /* Checksum */ + unsigned char pad[18]; +}; + +/* NCSI packet revision */ +#define NCSI_PKT_REVISION 0x01 + +/* NCSI packet commands */ +#define NCSI_PKT_CMD_CIS 0x00 /* Clear Initial State */ +#define NCSI_PKT_CMD_SP 0x01 /* Select Package */ +#define NCSI_PKT_CMD_DP 0x02 /* Deselect Package */ +#define NCSI_PKT_CMD_EC 0x03 /* Enable Channel */ +#define NCSI_PKT_CMD_DC 0x04 /* Disable Channel */ +#define NCSI_PKT_CMD_RC 0x05 /* Reset Channel */ +#define NCSI_PKT_CMD_ECNT 0x06 /* Enable Channel Network Tx */ +#define NCSI_PKT_CMD_DCNT 0x07 /* Disable Channel Network Tx */ +#define NCSI_PKT_CMD_AE 0x08 /* AEN Enable */ +#define NCSI_PKT_CMD_SL 0x09 /* Set Link */ +#define NCSI_PKT_CMD_GLS 0x0a /* Get Link */ +#define NCSI_PKT_CMD_SVF 0x0b /* Set VLAN Filter */ +#define NCSI_PKT_CMD_EV 0x0c /* Enable VLAN */ +#define NCSI_PKT_CMD_DV 0x0d /* Disable VLAN */ +#define NCSI_PKT_CMD_SMA 0x0e /* Set MAC address */ +#define NCSI_PKT_CMD_EBF 0x10 /* Enable Broadcast Filter */ +#define NCSI_PKT_CMD_DBF 0x11 /* Disable Broadcast Filter */ +#define NCSI_PKT_CMD_EGMF 0x12 /* Enable Global Multicast Filter */ +#define NCSI_PKT_CMD_DGMF 0x13 /* Disable Global Multicast Filter */ +#define NCSI_PKT_CMD_SNFC 0x14 /* Set NCSI Flow Control */ +#define NCSI_PKT_CMD_GVI 0x15 /* Get Version ID */ +#define NCSI_PKT_CMD_GC 0x16 /* Get Capabilities */ +#define NCSI_PKT_CMD_GP 0x17 /* Get Parameters */ +#define NCSI_PKT_CMD_GCPS 0x18 /* Get Controller Packet Statistics */ +#define NCSI_PKT_CMD_GNS 0x19 /* Get NCSI Statistics */ +#define NCSI_PKT_CMD_GNPTS 0x1a /* Get NCSI Pass-throu Statistics */ +#define NCSI_PKT_CMD_GPS 0x1b /* Get package status */ +#define NCSI_PKT_CMD_OEM 0x50 /* OEM */ +#define NCSI_PKT_CMD_PLDM 0x51 /* PLDM request over NCSI over RBT */ +#define NCSI_PKT_CMD_GPUUID 0x52 /* Get package UUID */ + +/* NCSI packet responses */ +#define NCSI_PKT_RSP_CIS (NCSI_PKT_CMD_CIS + 0x80) +#define NCSI_PKT_RSP_SP (NCSI_PKT_CMD_SP + 0x80) +#define NCSI_PKT_RSP_DP (NCSI_PKT_CMD_DP + 0x80) +#define NCSI_PKT_RSP_EC (NCSI_PKT_CMD_EC + 0x80) +#define NCSI_PKT_RSP_DC (NCSI_PKT_CMD_DC + 0x80) +#define NCSI_PKT_RSP_RC (NCSI_PKT_CMD_RC + 0x80) +#define NCSI_PKT_RSP_ECNT (NCSI_PKT_CMD_ECNT + 0x80) +#define NCSI_PKT_RSP_DCNT (NCSI_PKT_CMD_DCNT + 0x80) +#define NCSI_PKT_RSP_AE (NCSI_PKT_CMD_AE + 0x80) +#define NCSI_PKT_RSP_SL (NCSI_PKT_CMD_SL + 0x80) +#define NCSI_PKT_RSP_GLS (NCSI_PKT_CMD_GLS + 0x80) +#define NCSI_PKT_RSP_SVF (NCSI_PKT_CMD_SVF + 0x80) +#define NCSI_PKT_RSP_EV (NCSI_PKT_CMD_EV + 0x80) +#define NCSI_PKT_RSP_DV (NCSI_PKT_CMD_DV + 0x80) +#define NCSI_PKT_RSP_SMA (NCSI_PKT_CMD_SMA + 0x80) +#define NCSI_PKT_RSP_EBF (NCSI_PKT_CMD_EBF + 0x80) +#define NCSI_PKT_RSP_DBF (NCSI_PKT_CMD_DBF + 0x80) +#define NCSI_PKT_RSP_EGMF (NCSI_PKT_CMD_EGMF + 0x80) +#define NCSI_PKT_RSP_DGMF (NCSI_PKT_CMD_DGMF + 0x80) +#define NCSI_PKT_RSP_SNFC (NCSI_PKT_CMD_SNFC + 0x80) +#define NCSI_PKT_RSP_GVI (NCSI_PKT_CMD_GVI + 0x80) +#define NCSI_PKT_RSP_GC (NCSI_PKT_CMD_GC + 0x80) +#define NCSI_PKT_RSP_GP (NCSI_PKT_CMD_GP + 0x80) +#define NCSI_PKT_RSP_GCPS (NCSI_PKT_CMD_GCPS + 0x80) +#define NCSI_PKT_RSP_GNS (NCSI_PKT_CMD_GNS + 0x80) +#define NCSI_PKT_RSP_GNPTS (NCSI_PKT_CMD_GNPTS + 0x80) +#define NCSI_PKT_RSP_GPS (NCSI_PKT_CMD_GPS + 0x80) +#define NCSI_PKT_RSP_OEM (NCSI_PKT_CMD_OEM + 0x80) +#define NCSI_PKT_RSP_PLDM (NCSI_PKT_CMD_PLDM + 0x80) +#define NCSI_PKT_RSP_GPUUID (NCSI_PKT_CMD_GPUUID + 0x80) + +/* NCSI response code/reason */ +#define NCSI_PKT_RSP_C_COMPLETED 0x0000 /* Command Completed */ +#define NCSI_PKT_RSP_C_FAILED 0x0001 /* Command Failed */ +#define NCSI_PKT_RSP_C_UNAVAILABLE 0x0002 /* Command Unavailable */ +#define NCSI_PKT_RSP_C_UNSUPPORTED 0x0003 /* Command Unsupported */ +#define NCSI_PKT_RSP_R_NO_ERROR 0x0000 /* No Error */ +#define NCSI_PKT_RSP_R_INTERFACE 0x0001 /* Interface not ready */ +#define NCSI_PKT_RSP_R_PARAM 0x0002 /* Invalid Parameter */ +#define NCSI_PKT_RSP_R_CHANNEL 0x0003 /* Channel not Ready */ +#define NCSI_PKT_RSP_R_PACKAGE 0x0004 /* Package not Ready */ +#define NCSI_PKT_RSP_R_LENGTH 0x0005 /* Invalid payload length */ +#define NCSI_PKT_RSP_R_UNKNOWN 0x7fff /* Command type unsupported */ + +/* NCSI AEN packet type */ +#define NCSI_PKT_AEN 0xFF /* AEN Packet */ +#define NCSI_PKT_AEN_LSC 0x00 /* Link status change */ +#define NCSI_PKT_AEN_CR 0x01 /* Configuration required */ +#define NCSI_PKT_AEN_HNCDSC 0x02 /* HNC driver status change */ + +#endif /* __NCSI_PKT_H__ */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c new file mode 100644 index 000000000..af84389a6 --- /dev/null +++ b/net/ncsi/ncsi-rsp.c @@ -0,0 +1,1035 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "ncsi-pkt.h" + +static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, + unsigned short payload) +{ + struct ncsi_rsp_pkt_hdr *h; + u32 checksum; + __be32 *pchecksum; + + /* Check NCSI packet header. We don't need validate + * the packet type, which should have been checked + * before calling this function. + */ + h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp); + if (h->common.revision != NCSI_PKT_REVISION) + return -EINVAL; + if (ntohs(h->common.length) != payload) + return -EINVAL; + + /* Check on code and reason */ + if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED || + ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) + return -EINVAL; + + /* Validate checksum, which might be zeroes if the + * sender doesn't support checksum according to NCSI + * specification. + */ + pchecksum = (__be32 *)((void *)(h + 1) + payload - 4); + if (ntohl(*pchecksum) == 0) + return 0; + + checksum = ncsi_calculate_checksum((unsigned char *)h, + sizeof(*h) + payload - 4); + if (*pchecksum != htonl(checksum)) + return -EINVAL; + + return 0; +} + +static int ncsi_rsp_handler_cis(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned char id; + + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, &np, &nc); + if (!nc) { + if (ndp->flags & NCSI_DEV_PROBED) + return -ENXIO; + + id = NCSI_CHANNEL_INDEX(rsp->rsp.common.channel); + nc = ncsi_add_channel(np, id); + } + + return nc ? 0 : -ENODEV; +} + +static int ncsi_rsp_handler_sp(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + unsigned char id; + + /* Add the package if it's not existing. Otherwise, + * to change the state of its child channels. + */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, NULL); + if (!np) { + if (ndp->flags & NCSI_DEV_PROBED) + return -ENXIO; + + id = NCSI_PACKAGE_INDEX(rsp->rsp.common.channel); + np = ncsi_add_package(ndp, id); + if (!np) + return -ENODEV; + } + + return 0; +} + +static int ncsi_rsp_handler_dp(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned long flags; + + /* Find the package */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, NULL); + if (!np) + return -ENODEV; + + /* Change state of all channels attached to the package */ + NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + } + + return 0; +} + +static int ncsi_rsp_handler_ec(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_ENABLE]; + if (ncm->enable) + return -EBUSY; + + ncm->enable = 1; + return 0; +} + +static int ncsi_rsp_handler_dc(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + int ret; + + ret = ncsi_validate_rsp_pkt(nr, 4); + if (ret) + return ret; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_ENABLE]; + if (!ncm->enable) + return -EBUSY; + + ncm->enable = 0; + return 0; +} + +static int ncsi_rsp_handler_rc(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + unsigned long flags; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update state for the specified channel */ + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + + return 0; +} + +static int ncsi_rsp_handler_ecnt(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; + if (ncm->enable) + return -EBUSY; + + ncm->enable = 1; + return 0; +} + +static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; + if (!ncm->enable) + return -EBUSY; + + ncm->enable = 1; + return 0; +} + +static int ncsi_rsp_handler_ae(struct ncsi_request *nr) +{ + struct ncsi_cmd_ae_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if the AEN has been enabled */ + ncm = &nc->modes[NCSI_MODE_AEN]; + if (ncm->enable) + return -EBUSY; + + /* Update to AEN configuration */ + cmd = (struct ncsi_cmd_ae_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = cmd->mc_id; + ncm->data[1] = ntohl(cmd->mode); + + return 0; +} + +static int ncsi_rsp_handler_sl(struct ncsi_request *nr) +{ + struct ncsi_cmd_sl_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + cmd = (struct ncsi_cmd_sl_pkt *)skb_network_header(nr->cmd); + ncm = &nc->modes[NCSI_MODE_LINK]; + ncm->data[0] = ntohl(cmd->mode); + ncm->data[1] = ntohl(cmd->oem_mode); + + return 0; +} + +static int ncsi_rsp_handler_gls(struct ncsi_request *nr) +{ + struct ncsi_rsp_gls_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + unsigned long flags; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_gls_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_LINK]; + ncm->data[2] = ntohl(rsp->status); + ncm->data[3] = ntohl(rsp->other); + ncm->data[4] = ntohl(rsp->oem_status); + + if (nr->driven) + return 0; + + /* Reset the channel monitor if it has been enabled */ + spin_lock_irqsave(&nc->lock, flags); + nc->timeout = 0; + spin_unlock_irqrestore(&nc->lock, flags); + + return 0; +} + +static int ncsi_rsp_handler_svf(struct ncsi_request *nr) +{ + struct ncsi_cmd_svf_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_filter *ncf; + unsigned short vlan; + int ret; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd); + ncf = nc->filters[NCSI_FILTER_VLAN]; + if (!ncf) + return -ENOENT; + if (cmd->index >= ncf->total) + return -ERANGE; + + /* Add or remove the VLAN filter */ + if (!(cmd->enable & 0x1)) { + ret = ncsi_remove_filter(nc, NCSI_FILTER_VLAN, cmd->index); + } else { + vlan = ntohs(cmd->vlan); + ret = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan); + } + + return ret; +} + +static int ncsi_rsp_handler_ev(struct ncsi_request *nr) +{ + struct ncsi_cmd_ev_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if VLAN mode has been enabled */ + ncm = &nc->modes[NCSI_MODE_VLAN]; + if (ncm->enable) + return -EBUSY; + + /* Update to VLAN mode */ + cmd = (struct ncsi_cmd_ev_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = ntohl(cmd->mode); + + return 0; +} + +static int ncsi_rsp_handler_dv(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if VLAN mode has been enabled */ + ncm = &nc->modes[NCSI_MODE_VLAN]; + if (!ncm->enable) + return -EBUSY; + + /* Update to VLAN mode */ + ncm->enable = 0; + return 0; +} + +static int ncsi_rsp_handler_sma(struct ncsi_request *nr) +{ + struct ncsi_cmd_sma_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_filter *ncf; + void *bitmap; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* According to NCSI spec 1.01, the mixed filter table + * isn't supported yet. + */ + cmd = (struct ncsi_cmd_sma_pkt *)skb_network_header(nr->cmd); + switch (cmd->at_e >> 5) { + case 0x0: /* UC address */ + ncf = nc->filters[NCSI_FILTER_UC]; + break; + case 0x1: /* MC address */ + ncf = nc->filters[NCSI_FILTER_MC]; + break; + default: + return -EINVAL; + } + + /* Sanity check on the filter */ + if (!ncf) + return -ENOENT; + else if (cmd->index >= ncf->total) + return -ERANGE; + + bitmap = &ncf->bitmap; + if (cmd->at_e & 0x1) { + if (test_and_set_bit(cmd->index, bitmap)) + return -EBUSY; + memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6); + } else { + if (!test_and_clear_bit(cmd->index, bitmap)) + return -EBUSY; + + memset(ncf->data + 6 * cmd->index, 0, 6); + } + + return 0; +} + +static int ncsi_rsp_handler_ebf(struct ncsi_request *nr) +{ + struct ncsi_cmd_ebf_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if broadcast filter has been enabled */ + ncm = &nc->modes[NCSI_MODE_BC]; + if (ncm->enable) + return -EBUSY; + + /* Update to broadcast filter mode */ + cmd = (struct ncsi_cmd_ebf_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = ntohl(cmd->mode); + + return 0; +} + +static int ncsi_rsp_handler_dbf(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if broadcast filter isn't enabled */ + ncm = &nc->modes[NCSI_MODE_BC]; + if (!ncm->enable) + return -EBUSY; + + /* Update to broadcast filter mode */ + ncm->enable = 0; + ncm->data[0] = 0; + + return 0; +} + +static int ncsi_rsp_handler_egmf(struct ncsi_request *nr) +{ + struct ncsi_cmd_egmf_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if multicast filter has been enabled */ + ncm = &nc->modes[NCSI_MODE_MC]; + if (ncm->enable) + return -EBUSY; + + /* Update to multicast filter mode */ + cmd = (struct ncsi_cmd_egmf_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = ntohl(cmd->mode); + + return 0; +} + +static int ncsi_rsp_handler_dgmf(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if multicast filter has been enabled */ + ncm = &nc->modes[NCSI_MODE_MC]; + if (!ncm->enable) + return -EBUSY; + + /* Update to multicast filter mode */ + ncm->enable = 0; + ncm->data[0] = 0; + + return 0; +} + +static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) +{ + struct ncsi_cmd_snfc_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if flow control has been enabled */ + ncm = &nc->modes[NCSI_MODE_FC]; + if (ncm->enable) + return -EBUSY; + + /* Update to flow control mode */ + cmd = (struct ncsi_cmd_snfc_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = cmd->mode; + + return 0; +} + +static int ncsi_rsp_handler_gvi(struct ncsi_request *nr) +{ + struct ncsi_rsp_gvi_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_version *ncv; + int i; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gvi_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update to channel's version info */ + ncv = &nc->version; + ncv->version = ntohl(rsp->ncsi_version); + ncv->alpha2 = rsp->alpha2; + memcpy(ncv->fw_name, rsp->fw_name, 12); + ncv->fw_version = ntohl(rsp->fw_version); + for (i = 0; i < ARRAY_SIZE(ncv->pci_ids); i++) + ncv->pci_ids[i] = ntohs(rsp->pci_ids[i]); + ncv->mf_id = ntohl(rsp->mf_id); + + return 0; +} + +static int ncsi_rsp_handler_gc(struct ncsi_request *nr) +{ + struct ncsi_rsp_gc_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_filter *ncf; + size_t size, entry_size; + int cnt, i; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update channel's capabilities */ + nc->caps[NCSI_CAP_GENERIC].cap = ntohl(rsp->cap) & + NCSI_CAP_GENERIC_MASK; + nc->caps[NCSI_CAP_BC].cap = ntohl(rsp->bc_cap) & + NCSI_CAP_BC_MASK; + nc->caps[NCSI_CAP_MC].cap = ntohl(rsp->mc_cap) & + NCSI_CAP_MC_MASK; + nc->caps[NCSI_CAP_BUFFER].cap = ntohl(rsp->buf_cap); + nc->caps[NCSI_CAP_AEN].cap = ntohl(rsp->aen_cap) & + NCSI_CAP_AEN_MASK; + nc->caps[NCSI_CAP_VLAN].cap = rsp->vlan_mode & + NCSI_CAP_VLAN_MASK; + + /* Build filters */ + for (i = 0; i < NCSI_FILTER_MAX; i++) { + switch (i) { + case NCSI_FILTER_VLAN: + cnt = rsp->vlan_cnt; + entry_size = 2; + break; + case NCSI_FILTER_MIXED: + cnt = rsp->mixed_cnt; + entry_size = 6; + break; + case NCSI_FILTER_MC: + cnt = rsp->mc_cnt; + entry_size = 6; + break; + case NCSI_FILTER_UC: + cnt = rsp->uc_cnt; + entry_size = 6; + break; + default: + continue; + } + + if (!cnt || nc->filters[i]) + continue; + + size = sizeof(*ncf) + cnt * entry_size; + ncf = kzalloc(size, GFP_ATOMIC); + if (!ncf) { + pr_warn("%s: Cannot alloc filter table (%d)\n", + __func__, i); + return -ENOMEM; + } + + ncf->index = i; + ncf->total = cnt; + ncf->bitmap = 0x0ul; + nc->filters[i] = ncf; + } + + return 0; +} + +static int ncsi_rsp_handler_gp(struct ncsi_request *nr) +{ + struct ncsi_rsp_gp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + unsigned short enable, vlan; + unsigned char *pdata; + int table, i; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Modes with explicit enabled indications */ + if (ntohl(rsp->valid_modes) & 0x1) { /* BC filter mode */ + nc->modes[NCSI_MODE_BC].enable = 1; + nc->modes[NCSI_MODE_BC].data[0] = ntohl(rsp->bc_mode); + } + if (ntohl(rsp->valid_modes) & 0x2) /* Channel enabled */ + nc->modes[NCSI_MODE_ENABLE].enable = 1; + if (ntohl(rsp->valid_modes) & 0x4) /* Channel Tx enabled */ + nc->modes[NCSI_MODE_TX_ENABLE].enable = 1; + if (ntohl(rsp->valid_modes) & 0x8) /* MC filter mode */ + nc->modes[NCSI_MODE_MC].enable = 1; + + /* Modes without explicit enabled indications */ + nc->modes[NCSI_MODE_LINK].enable = 1; + nc->modes[NCSI_MODE_LINK].data[0] = ntohl(rsp->link_mode); + nc->modes[NCSI_MODE_VLAN].enable = 1; + nc->modes[NCSI_MODE_VLAN].data[0] = rsp->vlan_mode; + nc->modes[NCSI_MODE_FC].enable = 1; + nc->modes[NCSI_MODE_FC].data[0] = rsp->fc_mode; + nc->modes[NCSI_MODE_AEN].enable = 1; + nc->modes[NCSI_MODE_AEN].data[0] = ntohl(rsp->aen_mode); + + /* MAC addresses filter table */ + pdata = (unsigned char *)rsp + 48; + enable = rsp->mac_enable; + for (i = 0; i < rsp->mac_cnt; i++, pdata += 6) { + if (i >= (nc->filters[NCSI_FILTER_UC]->total + + nc->filters[NCSI_FILTER_MC]->total)) + table = NCSI_FILTER_MIXED; + else if (i >= nc->filters[NCSI_FILTER_UC]->total) + table = NCSI_FILTER_MC; + else + table = NCSI_FILTER_UC; + + if (!(enable & (0x1 << i))) + continue; + + if (ncsi_find_filter(nc, table, pdata) >= 0) + continue; + + ncsi_add_filter(nc, table, pdata); + } + + /* VLAN filter table */ + enable = ntohs(rsp->vlan_enable); + for (i = 0; i < rsp->vlan_cnt; i++, pdata += 2) { + if (!(enable & (0x1 << i))) + continue; + + vlan = ntohs(*(__be16 *)pdata); + if (ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan) >= 0) + continue; + + ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan); + } + + return 0; +} + +static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) +{ + struct ncsi_rsp_gcps_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_stats *ncs; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gcps_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update HNC's statistics */ + ncs = &nc->stats; + ncs->hnc_cnt_hi = ntohl(rsp->cnt_hi); + ncs->hnc_cnt_lo = ntohl(rsp->cnt_lo); + ncs->hnc_rx_bytes = ntohl(rsp->rx_bytes); + ncs->hnc_tx_bytes = ntohl(rsp->tx_bytes); + ncs->hnc_rx_uc_pkts = ntohl(rsp->rx_uc_pkts); + ncs->hnc_rx_mc_pkts = ntohl(rsp->rx_mc_pkts); + ncs->hnc_rx_bc_pkts = ntohl(rsp->rx_bc_pkts); + ncs->hnc_tx_uc_pkts = ntohl(rsp->tx_uc_pkts); + ncs->hnc_tx_mc_pkts = ntohl(rsp->tx_mc_pkts); + ncs->hnc_tx_bc_pkts = ntohl(rsp->tx_bc_pkts); + ncs->hnc_fcs_err = ntohl(rsp->fcs_err); + ncs->hnc_align_err = ntohl(rsp->align_err); + ncs->hnc_false_carrier = ntohl(rsp->false_carrier); + ncs->hnc_runt_pkts = ntohl(rsp->runt_pkts); + ncs->hnc_jabber_pkts = ntohl(rsp->jabber_pkts); + ncs->hnc_rx_pause_xon = ntohl(rsp->rx_pause_xon); + ncs->hnc_rx_pause_xoff = ntohl(rsp->rx_pause_xoff); + ncs->hnc_tx_pause_xon = ntohl(rsp->tx_pause_xon); + ncs->hnc_tx_pause_xoff = ntohl(rsp->tx_pause_xoff); + ncs->hnc_tx_s_collision = ntohl(rsp->tx_s_collision); + ncs->hnc_tx_m_collision = ntohl(rsp->tx_m_collision); + ncs->hnc_l_collision = ntohl(rsp->l_collision); + ncs->hnc_e_collision = ntohl(rsp->e_collision); + ncs->hnc_rx_ctl_frames = ntohl(rsp->rx_ctl_frames); + ncs->hnc_rx_64_frames = ntohl(rsp->rx_64_frames); + ncs->hnc_rx_127_frames = ntohl(rsp->rx_127_frames); + ncs->hnc_rx_255_frames = ntohl(rsp->rx_255_frames); + ncs->hnc_rx_511_frames = ntohl(rsp->rx_511_frames); + ncs->hnc_rx_1023_frames = ntohl(rsp->rx_1023_frames); + ncs->hnc_rx_1522_frames = ntohl(rsp->rx_1522_frames); + ncs->hnc_rx_9022_frames = ntohl(rsp->rx_9022_frames); + ncs->hnc_tx_64_frames = ntohl(rsp->tx_64_frames); + ncs->hnc_tx_127_frames = ntohl(rsp->tx_127_frames); + ncs->hnc_tx_255_frames = ntohl(rsp->tx_255_frames); + ncs->hnc_tx_511_frames = ntohl(rsp->tx_511_frames); + ncs->hnc_tx_1023_frames = ntohl(rsp->tx_1023_frames); + ncs->hnc_tx_1522_frames = ntohl(rsp->tx_1522_frames); + ncs->hnc_tx_9022_frames = ntohl(rsp->tx_9022_frames); + ncs->hnc_rx_valid_bytes = ntohl(rsp->rx_valid_bytes); + ncs->hnc_rx_runt_pkts = ntohl(rsp->rx_runt_pkts); + ncs->hnc_rx_jabber_pkts = ntohl(rsp->rx_jabber_pkts); + + return 0; +} + +static int ncsi_rsp_handler_gns(struct ncsi_request *nr) +{ + struct ncsi_rsp_gns_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_stats *ncs; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gns_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update HNC's statistics */ + ncs = &nc->stats; + ncs->ncsi_rx_cmds = ntohl(rsp->rx_cmds); + ncs->ncsi_dropped_cmds = ntohl(rsp->dropped_cmds); + ncs->ncsi_cmd_type_errs = ntohl(rsp->cmd_type_errs); + ncs->ncsi_cmd_csum_errs = ntohl(rsp->cmd_csum_errs); + ncs->ncsi_rx_pkts = ntohl(rsp->rx_pkts); + ncs->ncsi_tx_pkts = ntohl(rsp->tx_pkts); + ncs->ncsi_tx_aen_pkts = ntohl(rsp->tx_aen_pkts); + + return 0; +} + +static int ncsi_rsp_handler_gnpts(struct ncsi_request *nr) +{ + struct ncsi_rsp_gnpts_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_stats *ncs; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gnpts_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update HNC's statistics */ + ncs = &nc->stats; + ncs->pt_tx_pkts = ntohl(rsp->tx_pkts); + ncs->pt_tx_dropped = ntohl(rsp->tx_dropped); + ncs->pt_tx_channel_err = ntohl(rsp->tx_channel_err); + ncs->pt_tx_us_err = ntohl(rsp->tx_us_err); + ncs->pt_rx_pkts = ntohl(rsp->rx_pkts); + ncs->pt_rx_dropped = ntohl(rsp->rx_dropped); + ncs->pt_rx_channel_err = ntohl(rsp->rx_channel_err); + ncs->pt_rx_us_err = ntohl(rsp->rx_us_err); + ncs->pt_rx_os_err = ntohl(rsp->rx_os_err); + + return 0; +} + +static int ncsi_rsp_handler_gps(struct ncsi_request *nr) +{ + struct ncsi_rsp_gps_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + + /* Find the package */ + rsp = (struct ncsi_rsp_gps_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, NULL); + if (!np) + return -ENODEV; + + return 0; +} + +static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr) +{ + struct ncsi_rsp_gpuuid_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + + /* Find the package */ + rsp = (struct ncsi_rsp_gpuuid_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, NULL); + if (!np) + return -ENODEV; + + memcpy(np->uuid, rsp->uuid, sizeof(rsp->uuid)); + + return 0; +} + +static struct ncsi_rsp_handler { + unsigned char type; + int payload; + int (*handler)(struct ncsi_request *nr); +} ncsi_rsp_handlers[] = { + { NCSI_PKT_RSP_CIS, 4, ncsi_rsp_handler_cis }, + { NCSI_PKT_RSP_SP, 4, ncsi_rsp_handler_sp }, + { NCSI_PKT_RSP_DP, 4, ncsi_rsp_handler_dp }, + { NCSI_PKT_RSP_EC, 4, ncsi_rsp_handler_ec }, + { NCSI_PKT_RSP_DC, 4, ncsi_rsp_handler_dc }, + { NCSI_PKT_RSP_RC, 4, ncsi_rsp_handler_rc }, + { NCSI_PKT_RSP_ECNT, 4, ncsi_rsp_handler_ecnt }, + { NCSI_PKT_RSP_DCNT, 4, ncsi_rsp_handler_dcnt }, + { NCSI_PKT_RSP_AE, 4, ncsi_rsp_handler_ae }, + { NCSI_PKT_RSP_SL, 4, ncsi_rsp_handler_sl }, + { NCSI_PKT_RSP_GLS, 16, ncsi_rsp_handler_gls }, + { NCSI_PKT_RSP_SVF, 4, ncsi_rsp_handler_svf }, + { NCSI_PKT_RSP_EV, 4, ncsi_rsp_handler_ev }, + { NCSI_PKT_RSP_DV, 4, ncsi_rsp_handler_dv }, + { NCSI_PKT_RSP_SMA, 4, ncsi_rsp_handler_sma }, + { NCSI_PKT_RSP_EBF, 4, ncsi_rsp_handler_ebf }, + { NCSI_PKT_RSP_DBF, 4, ncsi_rsp_handler_dbf }, + { NCSI_PKT_RSP_EGMF, 4, ncsi_rsp_handler_egmf }, + { NCSI_PKT_RSP_DGMF, 4, ncsi_rsp_handler_dgmf }, + { NCSI_PKT_RSP_SNFC, 4, ncsi_rsp_handler_snfc }, + { NCSI_PKT_RSP_GVI, 36, ncsi_rsp_handler_gvi }, + { NCSI_PKT_RSP_GC, 32, ncsi_rsp_handler_gc }, + { NCSI_PKT_RSP_GP, -1, ncsi_rsp_handler_gp }, + { NCSI_PKT_RSP_GCPS, 172, ncsi_rsp_handler_gcps }, + { NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns }, + { NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts }, + { NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps }, + { NCSI_PKT_RSP_OEM, 0, NULL }, + { NCSI_PKT_RSP_PLDM, 0, NULL }, + { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid } +}; + +int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct ncsi_rsp_handler *nrh = NULL; + struct ncsi_dev *nd; + struct ncsi_dev_priv *ndp; + struct ncsi_request *nr; + struct ncsi_pkt_hdr *hdr; + unsigned long flags; + int payload, i, ret; + + /* Find the NCSI device */ + nd = ncsi_find_dev(dev); + ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; + if (!ndp) + return -ENODEV; + + /* Check if it is AEN packet */ + hdr = (struct ncsi_pkt_hdr *)skb_network_header(skb); + if (hdr->type == NCSI_PKT_AEN) + return ncsi_aen_handler(ndp, skb); + + /* Find the handler */ + for (i = 0; i < ARRAY_SIZE(ncsi_rsp_handlers); i++) { + if (ncsi_rsp_handlers[i].type == hdr->type) { + if (ncsi_rsp_handlers[i].handler) + nrh = &ncsi_rsp_handlers[i]; + else + nrh = NULL; + + break; + } + } + + if (!nrh) { + netdev_err(nd->dev, "Received unrecognized packet (0x%x)\n", + hdr->type); + return -ENOENT; + } + + /* Associate with the request */ + spin_lock_irqsave(&ndp->lock, flags); + nr = &ndp->requests[hdr->id]; + if (!nr->used) { + spin_unlock_irqrestore(&ndp->lock, flags); + return -ENODEV; + } + + nr->rsp = skb; + if (!nr->enabled) { + spin_unlock_irqrestore(&ndp->lock, flags); + ret = -ENOENT; + goto out; + } + + /* Validate the packet */ + spin_unlock_irqrestore(&ndp->lock, flags); + payload = nrh->payload; + if (payload < 0) + payload = ntohs(hdr->length); + ret = ncsi_validate_rsp_pkt(nr, payload); + if (ret) + goto out; + + /* Process the packet */ + ret = nrh->handler(nr); +out: + ncsi_free_request(nr); + return ret; +} diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 95e757c37..9266ceebd 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -609,9 +609,8 @@ config NETFILTER_XT_MARK The target allows you to create rules in the "mangle" table which alter the netfilter mark (nfmark) field associated with the packet. - Prior to routing, the nfmark can influence the routing method (see - "Use netfilter MARK value as routing key") and can also be used by - other subsystems to change their behavior. + Prior to routing, the nfmark can influence the routing method and can + also be used by other subsystems to change their behavior. config NETFILTER_XT_CONNMARK tristate 'ctmark target and match support' @@ -753,9 +752,8 @@ config NETFILTER_XT_TARGET_HMARK The target allows you to create rules in the "raw" and "mangle" tables which set the skbuff mark by means of hash calculation within a given - range. The nfmark can influence the routing method (see "Use netfilter - MARK value as routing key") and can also be used by other subsystems to - change their behaviour. + range. The nfmark can influence the routing method and can also be used + by other subsystems to change their behaviour. To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index d7024b2ed..5117bcb7d 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -395,6 +395,20 @@ static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_LAST] = "BUG!", }; +static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = { + [IP_VS_TCP_S_NONE] = false, + [IP_VS_TCP_S_ESTABLISHED] = true, + [IP_VS_TCP_S_SYN_SENT] = true, + [IP_VS_TCP_S_SYN_RECV] = true, + [IP_VS_TCP_S_FIN_WAIT] = false, + [IP_VS_TCP_S_TIME_WAIT] = false, + [IP_VS_TCP_S_CLOSE] = false, + [IP_VS_TCP_S_CLOSE_WAIT] = false, + [IP_VS_TCP_S_LAST_ACK] = false, + [IP_VS_TCP_S_LISTEN] = false, + [IP_VS_TCP_S_SYNACK] = true, +}; + #define sNO IP_VS_TCP_S_NONE #define sES IP_VS_TCP_S_ESTABLISHED #define sSS IP_VS_TCP_S_SYN_SENT @@ -418,6 +432,13 @@ static const char * tcp_state_name(int state) return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; } +static bool tcp_state_active(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return false; + return tcp_state_active_table[state]; +} + static struct tcp_states_t tcp_states [] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ @@ -540,12 +561,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state != IP_VS_TCP_S_ESTABLISHED)) { + !tcp_state_active(new_state)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state == IP_VS_TCP_S_ESTABLISHED)) { + tcp_state_active(new_state)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9f530adad..9934b0c93 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -83,6 +83,13 @@ void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) spin_lock(lock); while (unlikely(nf_conntrack_locks_all)) { spin_unlock(lock); + + /* + * Order the 'nf_conntrack_locks_all' load vs. the + * spin_unlock_wait() loads below, to ensure + * that 'nf_conntrack_locks_all_lock' is indeed held: + */ + smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ spin_unlock_wait(&nf_conntrack_locks_all_lock); spin_lock(lock); } @@ -128,6 +135,14 @@ static void nf_conntrack_all_lock(void) spin_lock(&nf_conntrack_locks_all_lock); nf_conntrack_locks_all = true; + /* + * Order the above store of 'nf_conntrack_locks_all' against + * the spin_unlock_wait() loads below, such that if + * nf_conntrack_lock() observes 'nf_conntrack_locks_all' + * we must observe nf_conntrack_locks[] held: + */ + smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ + for (i = 0; i < CONNTRACK_LOCKS; i++) { spin_unlock_wait(&nf_conntrack_locks[i]); } @@ -135,7 +150,13 @@ static void nf_conntrack_all_lock(void) static void nf_conntrack_all_unlock(void) { - nf_conntrack_locks_all = false; + /* + * All prior stores must be complete before we clear + * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() + * might observe the false value but not the entire + * critical section: + */ + smp_store_release(&nf_conntrack_locks_all, false); spin_unlock(&nf_conntrack_locks_all_lock); } @@ -327,16 +348,10 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); - - if (nf_ct_zone_add(tmpl, flags, zone) < 0) - goto out_free; - + nf_ct_zone_add(tmpl, zone); atomic_set(&tmpl->ct_general.use, 0); return tmpl; -out_free: - kfree(tmpl); - return NULL; } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); @@ -466,6 +481,23 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } +/* must be called with rcu read lock held */ +void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) +{ + struct hlist_nulls_head *hptr; + unsigned int sequence, hsz; + + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hsz = nf_conntrack_htable_size; + hptr = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + *hash = hptr; + *hsize = hsz; +} +EXPORT_SYMBOL_GPL(nf_conntrack_get_ht); + /* * Warning : * - Caller must take a reference on returned object @@ -824,67 +856,69 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ -static noinline int early_drop(struct net *net, unsigned int _hash) +static unsigned int early_drop_list(struct net *net, + struct hlist_nulls_head *head) { - /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; - struct nf_conn *tmp; struct hlist_nulls_node *n; - unsigned int i, hash, sequence; - struct nf_conn *ct = NULL; - spinlock_t *lockp; - bool ret = false; + unsigned int drops = 0; + struct nf_conn *tmp; - i = 0; + hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { + tmp = nf_ct_tuplehash_to_ctrack(h); - local_bh_disable(); -restart: - sequence = read_seqcount_begin(&nf_conntrack_generation); - for (; i < NF_CT_EVICTION_RANGE; i++) { - hash = scale_hash(_hash++); - lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; - nf_conntrack_lock(lockp); - if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { - spin_unlock(lockp); - goto restart; - } - hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], - hnnode) { - tmp = nf_ct_tuplehash_to_ctrack(h); - - if (test_bit(IPS_ASSURED_BIT, &tmp->status) || - !net_eq(nf_ct_net(tmp), net) || - nf_ct_is_dying(tmp)) - continue; - - if (atomic_inc_not_zero(&tmp->ct_general.use)) { - ct = tmp; - break; - } - } + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || + !net_eq(nf_ct_net(tmp), net) || + nf_ct_is_dying(tmp)) + continue; - spin_unlock(lockp); - if (ct) - break; + if (!atomic_inc_not_zero(&tmp->ct_general.use)) + continue; + + /* kill only if still in same netns -- might have moved due to + * SLAB_DESTROY_BY_RCU rules. + * + * We steal the timer reference. If that fails timer has + * already fired or someone else deleted it. Just drop ref + * and move to next entry. + */ + if (net_eq(nf_ct_net(tmp), net) && + nf_ct_is_confirmed(tmp) && + del_timer(&tmp->timeout) && + nf_ct_delete(tmp, 0, 0)) + drops++; + + nf_ct_put(tmp); } - local_bh_enable(); + return drops; +} - if (!ct) - return false; +static noinline int early_drop(struct net *net, unsigned int _hash) +{ + unsigned int i; - /* kill only if in same netns -- might have moved due to - * SLAB_DESTROY_BY_RCU rules - */ - if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) { - if (nf_ct_delete(ct, 0, 0)) { - NF_CT_STAT_INC_ATOMIC(net, early_drop); - ret = true; + for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { + struct hlist_nulls_head *ct_hash; + unsigned hash, sequence, drops; + + rcu_read_lock(); + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hash = scale_hash(_hash++); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + drops = early_drop_list(net, &ct_hash[hash]); + rcu_read_unlock(); + + if (drops) { + NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); + return true; } } - nf_ct_put(ct); - return ret; + return false; } static struct nf_conn * @@ -930,16 +964,13 @@ __nf_conntrack_alloc(struct net *net, offsetof(struct nf_conn, proto) - offsetof(struct nf_conn, __nfct_init_offset[0])); - if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0) - goto out_free; + nf_ct_zone_add(ct, zone); /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ atomic_set(&ct->ct_general.use, 0); return ct; -out_free: - kmem_cache_free(nf_conntrack_cachep, ct); out: atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); @@ -1004,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; - if (tmpl && nfct_synproxy(tmpl)) { - nfct_seqadj_ext_add(ct); - nfct_synproxy_ext_add(ct); + if (!nf_ct_add_synproxy(ct, tmpl)) { + nf_conntrack_free(ct); + return ERR_PTR(-ENOMEM); } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; @@ -1343,14 +1374,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); -#ifdef CONFIG_NF_CONNTRACK_ZONES -static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { - .len = sizeof(struct nf_conntrack_zone), - .align = __alignof__(struct nf_conntrack_zone), - .id = NF_CT_EXT_ZONE, -}; -#endif - #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include @@ -1533,9 +1556,6 @@ void nf_conntrack_cleanup_end(void) nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); -#ifdef CONFIG_NF_CONNTRACK_ZONES - nf_ct_extend_unregister(&nf_ct_zone_extend); -#endif nf_conntrack_proto_fini(); nf_conntrack_seqadj_fini(); nf_conntrack_labels_fini(); @@ -1625,24 +1645,14 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) } EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); -int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +int nf_conntrack_hash_resize(unsigned int hashsize) { - int i, bucket, rc; - unsigned int hashsize, old_size; + int i, bucket; + unsigned int old_size; struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - if (current->nsproxy->net_ns != &init_net) - return -EOPNOTSUPP; - - /* On boot, we can set this without any fancy locking. */ - if (!nf_conntrack_htable_size) - return param_set_uint(val, kp); - - rc = kstrtouint(val, 0, &hashsize); - if (rc) - return rc; if (!hashsize) return -EINVAL; @@ -1650,6 +1660,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!hash) return -ENOMEM; + old_size = nf_conntrack_htable_size; + if (old_size == hashsize) { + nf_ct_free_hashtable(hash, hashsize); + return 0; + } + local_bh_disable(); nf_conntrack_all_lock(); write_seqcount_begin(&nf_conntrack_generation); @@ -1685,6 +1701,25 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) nf_ct_free_hashtable(old_hash, old_size); return 0; } + +int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +{ + unsigned int hashsize; + int rc; + + if (current->nsproxy->net_ns != &init_net) + return -EOPNOTSUPP; + + /* On boot, we can set this without any fancy locking. */ + if (!nf_conntrack_htable_size) + return param_set_uint(val, kp); + + rc = kstrtouint(val, 0, &hashsize); + if (rc) + return rc; + + return nf_conntrack_hash_resize(hashsize); +} EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, @@ -1741,7 +1776,7 @@ int nf_conntrack_init_start(void) nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), 0, - SLAB_DESTROY_BY_RCU, NULL); + SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; @@ -1781,11 +1816,6 @@ int nf_conntrack_init_start(void) if (ret < 0) goto err_seqadj; -#ifdef CONFIG_NF_CONNTRACK_ZONES - ret = nf_ct_extend_register(&nf_ct_zone_extend); - if (ret < 0) - goto err_extend; -#endif ret = nf_conntrack_proto_init(); if (ret < 0) goto err_proto; @@ -1801,10 +1831,6 @@ int nf_conntrack_init_start(void) return 0; err_proto: -#ifdef CONFIG_NF_CONNTRACK_ZONES - nf_ct_extend_unregister(&nf_ct_zone_extend); -err_extend: -#endif nf_conntrack_seqadj_fini(); err_seqadj: nf_conntrack_labels_fini(); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 9e3693128..f8dbacf66 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -574,7 +574,7 @@ static int exp_seq_show(struct seq_file *s, void *v) helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); - if (helper->expect_policy[expect->class].name) + if (helper->expect_policy[expect->class].name[0]) seq_printf(s, "/%s", helper->expect_policy[expect->class].name); } diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 1a9545965..02bcf00c2 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -73,7 +73,7 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, size_t var_alloc_len, gfp_t gfp) { struct nf_ct_ext *old, *new; - int i, newlen, newoff; + int newlen, newoff; struct nf_ct_ext_type *t; /* Conntrack must not be confirmed to avoid races on reallocation. */ @@ -99,19 +99,8 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, return NULL; if (new != old) { - for (i = 0; i < NF_CT_EXT_NUM; i++) { - if (!__nf_ct_ext_exist(old, i)) - continue; - - rcu_read_lock(); - t = rcu_dereference(nf_ct_ext_types[i]); - if (t && t->move) - t->move((void *)new + new->offset[i], - (void *)old + old->offset[i]); - rcu_read_unlock(); - } kfree_rcu(old, rcu); - ct->ext = new; + rcu_assign_pointer(ct->ext, new); } new->offset[id] = newoff; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 19efeba02..43147005b 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -572,7 +572,7 @@ static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct) return 0; } -static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly; +static struct nf_conntrack_helper ftp[MAX_PORTS * 2] __read_mostly; static const struct nf_conntrack_expect_policy ftp_exp_policy = { .max_expected = 1, @@ -582,24 +582,13 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = { /* don't make this __exit, since it's called from __init ! */ static void nf_conntrack_ftp_fini(void) { - int i, j; - for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) { - if (ftp[i][j].me == NULL) - continue; - - pr_debug("unregistering helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); - nf_conntrack_helper_unregister(&ftp[i][j]); - } - } - + nf_conntrack_helpers_unregister(ftp, ports_c * 2); kfree(ftp_buffer); } static int __init nf_conntrack_ftp_init(void) { - int i, j = -1, ret = 0; + int i, ret = 0; ftp_buffer = kmalloc(65536, GFP_KERNEL); if (!ftp_buffer) @@ -611,32 +600,21 @@ static int __init nf_conntrack_ftp_init(void) /* FIXME should be configurable whether IPv4 and IPv6 FTP connections are tracked or not - YK */ for (i = 0; i < ports_c; i++) { - ftp[i][0].tuple.src.l3num = PF_INET; - ftp[i][1].tuple.src.l3num = PF_INET6; - for (j = 0; j < 2; j++) { - ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master); - ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); - ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; - ftp[i][j].expect_policy = &ftp_exp_policy; - ftp[i][j].me = THIS_MODULE; - ftp[i][j].help = help; - ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr; - if (ports[i] == FTP_PORT) - sprintf(ftp[i][j].name, "ftp"); - else - sprintf(ftp[i][j].name, "ftp-%d", ports[i]); - - pr_debug("registering helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); - ret = nf_conntrack_helper_register(&ftp[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_ftp_fini(); - return ret; - } - } + nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP, "ftp", + FTP_PORT, ports[i], ports[i], &ftp_exp_policy, + 0, sizeof(struct nf_ct_ftp_master), help, + nf_ct_ftp_from_nlattr, THIS_MODULE); + nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP, "ftp", + FTP_PORT, ports[i], ports[i], &ftp_exp_policy, + 0, sizeof(struct nf_ct_ftp_master), help, + nf_ct_ftp_from_nlattr, THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(ftp, ports_c * 2); + if (ret < 0) { + pr_err("failed to register helpers\n"); + kfree(ftp_buffer); + return ret; } return 0; diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index bcd5ed6b7..89b2e4692 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -846,9 +846,10 @@ int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931) sz -= len; /* Message Type */ - if (sz < 1) + if (sz < 2) return H323_ERROR_BOUND; q931->MessageType = *p++; + sz--; PRINT("MessageType = %02X\n", q931->MessageType); if (*p & 0x80) { p++; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 9511af04d..5c0db5c64 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1272,19 +1272,6 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, return NULL; } -/****************************************************************************/ -static int set_expect_timeout(struct nf_conntrack_expect *exp, - unsigned int timeout) -{ - if (!exp || !del_timer(&exp->timeout)) - return 0; - - exp->timeout.expires = jiffies + timeout * HZ; - add_timer(&exp->timeout); - - return 1; -} - /****************************************************************************/ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -1486,7 +1473,8 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, "timeout to %u seconds for", info->timeout); nf_ct_dump_tuple(&exp->tuple); - set_expect_timeout(exp, info->timeout); + mod_timer_pending(&exp->timeout, + jiffies + info->timeout * HZ); } spin_unlock_bh(&nf_conntrack_expect_lock); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 196cb3964..b989b81ac 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -387,13 +387,42 @@ EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, struct net *net) +{ + struct nf_conntrack_tuple_hash *h; + const struct hlist_nulls_node *nn; + int cpu; + + /* Get rid of expecteds, set helpers to NULL. */ + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) + unhelp(h, me); + spin_unlock_bh(&pcpu->lock); + } +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_expect *exp; const struct hlist_node *next; const struct hlist_nulls_node *nn; + unsigned int last_hsize; + spinlock_t *lock; + struct net *net; unsigned int i; - int cpu; + + mutex_lock(&nf_ct_helper_mutex); + hlist_del_rcu(&me->hnode); + nf_ct_helper_count--; + mutex_unlock(&nf_ct_helper_mutex); + + /* Make sure every nothing is still using the helper unless its a + * connection in the hash. + */ + synchronize_rcu(); /* Get rid of expectations */ spin_lock_bh(&nf_conntrack_expect_lock); @@ -413,47 +442,85 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, } spin_unlock_bh(&nf_conntrack_expect_lock); - /* Get rid of expecteds, set helpers to NULL. */ - for_each_possible_cpu(cpu) { - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + rtnl_lock(); + for_each_net(net) + __nf_conntrack_helper_unregister(me, net); + rtnl_unlock(); - spin_lock_bh(&pcpu->lock); - hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) - unhelp(h, me); - spin_unlock_bh(&pcpu->lock); - } local_bh_disable(); - for (i = 0; i < nf_conntrack_htable_size; i++) { - nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < nf_conntrack_htable_size) { - hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) - unhelp(h, me); +restart: + last_hsize = nf_conntrack_htable_size; + for (i = 0; i < last_hsize; i++) { + lock = &nf_conntrack_locks[i % CONNTRACK_LOCKS]; + nf_conntrack_lock(lock); + if (last_hsize != nf_conntrack_htable_size) { + spin_unlock(lock); + goto restart; } - spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) + unhelp(h, me); + spin_unlock(lock); } local_bh_enable(); } +EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); -void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +void nf_ct_helper_init(struct nf_conntrack_helper *helper, + u16 l3num, u16 protonum, const char *name, + u16 default_port, u16 spec_port, u32 id, + const struct nf_conntrack_expect_policy *exp_pol, + u32 expect_class_max, u32 data_len, + int (*help)(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo), + int (*from_nlattr)(struct nlattr *attr, + struct nf_conn *ct), + struct module *module) { - struct net *net; + helper->tuple.src.l3num = l3num; + helper->tuple.dst.protonum = protonum; + helper->tuple.src.u.all = htons(spec_port); + helper->expect_policy = exp_pol; + helper->expect_class_max = expect_class_max; + helper->data_len = data_len; + helper->help = help; + helper->from_nlattr = from_nlattr; + helper->me = module; + + if (spec_port == default_port) + snprintf(helper->name, sizeof(helper->name), "%s", name); + else + snprintf(helper->name, sizeof(helper->name), "%s-%u", name, id); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_init); - mutex_lock(&nf_ct_helper_mutex); - hlist_del_rcu(&me->hnode); - nf_ct_helper_count--; - mutex_unlock(&nf_ct_helper_mutex); +int nf_conntrack_helpers_register(struct nf_conntrack_helper *helper, + unsigned int n) +{ + unsigned int i; + int err = 0; - /* Make sure every nothing is still using the helper unless its a - * connection in the hash. - */ - synchronize_rcu(); + for (i = 0; i < n; i++) { + err = nf_conntrack_helper_register(&helper[i]); + if (err < 0) + goto err; + } - rtnl_lock(); - for_each_net(net) - __nf_conntrack_helper_unregister(me, net); - rtnl_unlock(); + return err; +err: + if (i > 0) + nf_conntrack_helpers_unregister(helper, i); + return err; } -EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); +EXPORT_SYMBOL_GPL(nf_conntrack_helpers_register); + +void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper, + unsigned int n) +{ + while (n-- > 0) + nf_conntrack_helper_unregister(&helper[n]); +} +EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister); static struct nf_ct_ext_type helper_extend __read_mostly = { .len = sizeof(struct nf_conn_help), diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index f97ac61d2..1972a149f 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -255,27 +255,18 @@ static int __init nf_conntrack_irc_init(void) ports[ports_c++] = IRC_PORT; for (i = 0; i < ports_c; i++) { - irc[i].tuple.src.l3num = AF_INET; - irc[i].tuple.src.u.tcp.port = htons(ports[i]); - irc[i].tuple.dst.protonum = IPPROTO_TCP; - irc[i].expect_policy = &irc_exp_policy; - irc[i].me = THIS_MODULE; - irc[i].help = help; - - if (ports[i] == IRC_PORT) - sprintf(irc[i].name, "irc"); - else - sprintf(irc[i].name, "irc-%u", i); - - ret = nf_conntrack_helper_register(&irc[i]); - if (ret) { - pr_err("failed to register helper for pf: %u port: %u\n", - irc[i].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_irc_fini(); - return ret; - } + nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, "irc", + IRC_PORT, ports[i], i, &irc_exp_policy, + 0, 0, help, NULL, THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(&irc[0], ports_c); + if (ret) { + pr_err("failed to register helpers\n"); + kfree(irc_buffer); + return ret; } + return 0; } @@ -283,10 +274,7 @@ static int __init nf_conntrack_irc_init(void) * it is needed by the init function */ static void nf_conntrack_irc_fini(void) { - int i; - - for (i = 0; i < ports_c; i++) - nf_conntrack_helper_unregister(&irc[i]); + nf_conntrack_helpers_unregister(irc, ports_c); kfree(irc_buffer); } diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 252e6a7cd..bcab8bde7 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -16,23 +16,6 @@ static spinlock_t nf_connlabels_lock; -int nf_connlabel_set(struct nf_conn *ct, u16 bit) -{ - struct nf_conn_labels *labels = nf_ct_labels_find(ct); - - if (!labels || BIT_WORD(bit) >= labels->words) - return -ENOSPC; - - if (test_bit(bit, labels->bits)) - return 0; - - if (!test_and_set_bit(bit, labels->bits)) - nf_conntrack_event_cache(IPCT_LABEL, ct); - - return 0; -} -EXPORT_SYMBOL_GPL(nf_connlabel_set); - static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; @@ -60,7 +43,7 @@ int nf_connlabels_replace(struct nf_conn *ct, if (!labels) return -ENOSPC; - size = labels->words * sizeof(long); + size = sizeof(labels->bits); if (size < (words32 * sizeof(u32))) words32 = size / sizeof(u32); @@ -80,16 +63,11 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace); int nf_connlabels_get(struct net *net, unsigned int bits) { - size_t words; - - words = BIT_WORD(bits) + 1; - if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long)) + if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; spin_lock(&nf_connlabels_lock); net->ct.labels_used++; - if (words > net->ct.label_words) - net->ct.label_words = words; spin_unlock(&nf_connlabels_lock); return 0; @@ -100,8 +78,6 @@ void nf_connlabels_put(struct net *net) { spin_lock(&nf_connlabels_lock); net->ct.labels_used--; - if (net->ct.labels_used == 0) - net->ct.label_words = 0; spin_unlock(&nf_connlabels_lock); } EXPORT_SYMBOL_GPL(nf_connlabels_put); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index a18d1ceab..fdfc71f41 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -346,25 +346,25 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct) if (!labels) return 0; - return nla_total_size(labels->words * sizeof(long)); + return nla_total_size(sizeof(labels->bits)); } static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - unsigned int len, i; + unsigned int i; if (!labels) return 0; - len = labels->words * sizeof(long); i = 0; do { if (labels->bits[i] != 0) - return nla_put(skb, CTA_LABELS, len, labels->bits); + return nla_put(skb, CTA_LABELS, sizeof(labels->bits), + labels->bits); i++; - } while (i < labels->words); + } while (i < ARRAY_SIZE(labels->bits)); return 0; } @@ -1894,6 +1894,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) return -EINVAL; + if (otuple.dst.protonum != rtuple.dst.protonum) + return -EINVAL; ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple, &rtuple, u3); @@ -2362,12 +2364,8 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, return PTR_ERR(exp); err = nf_ct_expect_related_report(exp, portid, report); - if (err < 0) { - nf_ct_expect_put(exp); - return err; - } - - return 0; + nf_ct_expect_put(exp); + return err; } static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct, diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 3fcbaab83..9dcb9ee9b 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -166,7 +166,7 @@ out: return ret; } -static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly; +static struct nf_conntrack_helper sane[MAX_PORTS * 2] __read_mostly; static const struct nf_conntrack_expect_policy sane_exp_policy = { .max_expected = 1, @@ -176,22 +176,13 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = { /* don't make this __exit, since it's called from __init ! */ static void nf_conntrack_sane_fini(void) { - int i, j; - - for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) { - pr_debug("unregistering helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); - nf_conntrack_helper_unregister(&sane[i][j]); - } - } - + nf_conntrack_helpers_unregister(sane, ports_c * 2); kfree(sane_buffer); } static int __init nf_conntrack_sane_init(void) { - int i, j = -1, ret = 0; + int i, ret = 0; sane_buffer = kmalloc(65536, GFP_KERNEL); if (!sane_buffer) @@ -203,31 +194,23 @@ static int __init nf_conntrack_sane_init(void) /* FIXME should be configurable whether IPv4 and IPv6 connections are tracked or not - YK */ for (i = 0; i < ports_c; i++) { - sane[i][0].tuple.src.l3num = PF_INET; - sane[i][1].tuple.src.l3num = PF_INET6; - for (j = 0; j < 2; j++) { - sane[i][j].data_len = sizeof(struct nf_ct_sane_master); - sane[i][j].tuple.src.u.tcp.port = htons(ports[i]); - sane[i][j].tuple.dst.protonum = IPPROTO_TCP; - sane[i][j].expect_policy = &sane_exp_policy; - sane[i][j].me = THIS_MODULE; - sane[i][j].help = help; - if (ports[i] == SANE_PORT) - sprintf(sane[i][j].name, "sane"); - else - sprintf(sane[i][j].name, "sane-%d", ports[i]); - - pr_debug("registering helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); - ret = nf_conntrack_helper_register(&sane[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_sane_fini(); - return ret; - } - } + nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP, "sane", + SANE_PORT, ports[i], ports[i], + &sane_exp_policy, 0, + sizeof(struct nf_ct_sane_master), help, NULL, + THIS_MODULE); + nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP, "sane", + SANE_PORT, ports[i], ports[i], + &sane_exp_policy, 0, + sizeof(struct nf_ct_sane_master), help, NULL, + THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(sane, ports_c * 2); + if (ret < 0) { + pr_err("failed to register helpers\n"); + kfree(sane_buffer); + return ret; } return 0; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index f72ba5587..7d77217de 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1383,7 +1383,7 @@ static int process_sip_response(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } cseq = simple_strtoul(*dptr + matchoff, NULL, 10); - if (!cseq) { + if (!cseq && *(*dptr + matchoff) != '0') { nf_ct_helper_log(skb, ct, "cannot get cseq"); return NF_DROP; } @@ -1446,7 +1446,7 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } cseq = simple_strtoul(*dptr + matchoff, NULL, 10); - if (!cseq) { + if (!cseq && *(*dptr + matchoff) != '0') { nf_ct_helper_log(skb, ct, "cannot get cseq"); return NF_DROP; } @@ -1589,7 +1589,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen); } -static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; +static struct nf_conntrack_helper sip[MAX_PORTS * 4] __read_mostly; static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { [SIP_EXPECT_SIGNALLING] = { @@ -1616,20 +1616,12 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1 static void nf_conntrack_sip_fini(void) { - int i, j; - - for (i = 0; i < ports_c; i++) { - for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { - if (sip[i][j].me == NULL) - continue; - nf_conntrack_helper_unregister(&sip[i][j]); - } - } + nf_conntrack_helpers_unregister(sip, ports_c * 4); } static int __init nf_conntrack_sip_init(void) { - int i, j, ret; + int i, ret; if (ports_c == 0) ports[ports_c++] = SIP_PORT; @@ -1637,43 +1629,32 @@ static int __init nf_conntrack_sip_init(void) for (i = 0; i < ports_c; i++) { memset(&sip[i], 0, sizeof(sip[i])); - sip[i][0].tuple.src.l3num = AF_INET; - sip[i][0].tuple.dst.protonum = IPPROTO_UDP; - sip[i][0].help = sip_help_udp; - sip[i][1].tuple.src.l3num = AF_INET; - sip[i][1].tuple.dst.protonum = IPPROTO_TCP; - sip[i][1].help = sip_help_tcp; - - sip[i][2].tuple.src.l3num = AF_INET6; - sip[i][2].tuple.dst.protonum = IPPROTO_UDP; - sip[i][2].help = sip_help_udp; - sip[i][3].tuple.src.l3num = AF_INET6; - sip[i][3].tuple.dst.protonum = IPPROTO_TCP; - sip[i][3].help = sip_help_tcp; - - for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { - sip[i][j].data_len = sizeof(struct nf_ct_sip_master); - sip[i][j].tuple.src.u.udp.port = htons(ports[i]); - sip[i][j].expect_policy = sip_exp_policy; - sip[i][j].expect_class_max = SIP_EXPECT_MAX; - sip[i][j].me = THIS_MODULE; - - if (ports[i] == SIP_PORT) - sprintf(sip[i][j].name, "sip"); - else - sprintf(sip[i][j].name, "sip-%u", i); - - pr_debug("port #%u: %u\n", i, ports[i]); + nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_udp, + NULL, THIS_MODULE); + nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_tcp, + NULL, THIS_MODULE); + nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_udp, + NULL, THIS_MODULE); + nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_tcp, + NULL, THIS_MODULE); + } - ret = nf_conntrack_helper_register(&sip[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %u port: %u\n", - sip[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_sip_fini(); - return ret; - } - } + ret = nf_conntrack_helpers_register(sip, ports_c * 4); + if (ret < 0) { + pr_err("failed to register helpers\n"); + return ret; } return 0; } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index c026c472e..9f267c3ff 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -48,6 +48,8 @@ EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { struct seq_net_private p; + struct hlist_nulls_head *hash; + unsigned int htable_size; unsigned int bucket; u_int64_t time_now; }; @@ -58,9 +60,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < st->htable_size; st->bucket++) { - n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); + n = rcu_dereference( + hlist_nulls_first_rcu(&st->hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -75,12 +78,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= st->htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu( - &nf_conntrack_hash[st->bucket])); + hlist_nulls_first_rcu(&st->hash[st->bucket])); } return head; } @@ -102,6 +104,8 @@ static void *ct_seq_start(struct seq_file *seq, loff_t *pos) st->time_now = ktime_get_real_ns(); rcu_read_lock(); + + nf_conntrack_get_ht(&st->hash, &st->htable_size); return ct_get_idx(seq, *pos); } @@ -201,6 +205,7 @@ static int ct_seq_show(struct seq_file *s, void *v) struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; + struct net *net = seq_file_net(s); int ret = 0; NF_CT_ASSERT(ct); @@ -211,6 +216,9 @@ static int ct_seq_show(struct seq_file *s, void *v) if (NF_CT_DIRECTION(hash)) goto release; + if (!net_eq(nf_ct_net(ct), net)) + goto release; + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); NF_CT_ASSERT(l3proto); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); @@ -434,8 +442,29 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) #ifdef CONFIG_SYSCTL /* Log invalid packets of a given protocol */ -static int log_invalid_proto_min = 0; -static int log_invalid_proto_max = 255; +static int log_invalid_proto_min __read_mostly; +static int log_invalid_proto_max __read_mostly = 255; + +/* size the user *wants to set */ +static unsigned int nf_conntrack_htable_size_user __read_mostly; + +static int +nf_conntrack_hash_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret < 0 || !write) + return ret; + + /* update ret, we might not be able to satisfy request */ + ret = nf_conntrack_hash_resize(nf_conntrack_htable_size_user); + + /* update it to the actual value used by conntrack */ + nf_conntrack_htable_size_user = nf_conntrack_htable_size; + return ret; +} static struct ctl_table_header *nf_ct_netfilter_header; @@ -456,10 +485,10 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, { .procname = "nf_conntrack_buckets", - .data = &nf_conntrack_htable_size, + .data = &nf_conntrack_htable_size_user, .maxlen = sizeof(unsigned int), - .mode = 0444, - .proc_handler = proc_dointvec, + .mode = 0644, + .proc_handler = nf_conntrack_hash_sysctl, }, { .procname = "nf_conntrack_checksum", @@ -515,6 +544,9 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) if (net->user_ns != &init_user_ns) table[0].procname = NULL; + if (!net_eq(&init_net, net)) + table[2].mode = 0444; + net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.sysctl_header) goto out_unregister_netfilter; @@ -604,6 +636,8 @@ static int __init nf_conntrack_standalone_init(void) ret = -ENOMEM; goto out_sysctl; } + + nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif ret = register_pernet_subsys(&nf_conntrack_net_ops); diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 2e65b5430..b1227dc6f 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -97,7 +97,7 @@ static int tftp_help(struct sk_buff *skb, return ret; } -static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly; +static struct nf_conntrack_helper tftp[MAX_PORTS * 2] __read_mostly; static const struct nf_conntrack_expect_policy tftp_exp_policy = { .max_expected = 1, @@ -106,47 +106,29 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = { static void nf_conntrack_tftp_fini(void) { - int i, j; - - for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) - nf_conntrack_helper_unregister(&tftp[i][j]); - } + nf_conntrack_helpers_unregister(tftp, ports_c * 2); } static int __init nf_conntrack_tftp_init(void) { - int i, j, ret; + int i, ret; if (ports_c == 0) ports[ports_c++] = TFTP_PORT; for (i = 0; i < ports_c; i++) { - memset(&tftp[i], 0, sizeof(tftp[i])); - - tftp[i][0].tuple.src.l3num = AF_INET; - tftp[i][1].tuple.src.l3num = AF_INET6; - for (j = 0; j < 2; j++) { - tftp[i][j].tuple.dst.protonum = IPPROTO_UDP; - tftp[i][j].tuple.src.u.udp.port = htons(ports[i]); - tftp[i][j].expect_policy = &tftp_exp_policy; - tftp[i][j].me = THIS_MODULE; - tftp[i][j].help = tftp_help; - - if (ports[i] == TFTP_PORT) - sprintf(tftp[i][j].name, "tftp"); - else - sprintf(tftp[i][j].name, "tftp-%u", i); - - ret = nf_conntrack_helper_register(&tftp[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %u port: %u\n", - tftp[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_tftp_fini(); - return ret; - } - } + nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP, "tftp", + TFTP_PORT, ports[i], i, &tftp_exp_policy, + 0, 0, tftp_help, NULL, THIS_MODULE); + nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP, "tftp", + TFTP_PORT, ports[i], i, &tftp_exp_policy, + 0, 0, tftp_help, NULL, THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(tftp, ports_c * 2); + if (ret < 0) { + pr_err("failed to register helpers\n"); + return ret; } return 0; } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index a5d41dfa9..aa5847a16 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -159,6 +159,20 @@ int nf_logger_find_get(int pf, enum nf_log_type type) struct nf_logger *logger; int ret = -ENOENT; + if (pf == NFPROTO_INET) { + ret = nf_logger_find_get(NFPROTO_IPV4, type); + if (ret < 0) + return ret; + + ret = nf_logger_find_get(NFPROTO_IPV6, type); + if (ret < 0) { + nf_logger_put(NFPROTO_IPV4, type); + return ret; + } + + return 0; + } + if (rcu_access_pointer(loggers[pf][type]) == NULL) request_module("nf-logger-%u-%u", pf, type); @@ -167,7 +181,7 @@ int nf_logger_find_get(int pf, enum nf_log_type type) if (logger == NULL) goto out; - if (logger && try_module_get(logger->me)) + if (try_module_get(logger->me)) ret = 0; out: rcu_read_unlock(); @@ -179,6 +193,12 @@ void nf_logger_put(int pf, enum nf_log_type type) { struct nf_logger *logger; + if (pf == NFPROTO_INET) { + nf_logger_put(NFPROTO_IPV4, type); + nf_logger_put(NFPROTO_IPV6, type); + return; + } + BUG_ON(loggers[pf][type] == NULL); rcu_read_lock(); @@ -398,16 +418,17 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write, { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; - size_t size = *lenp; int r = 0; int tindex = (unsigned long)table->extra1; struct net *net = current->nsproxy->net_ns; if (write) { - if (size > sizeof(buf)) - size = sizeof(buf); - if (copy_from_user(buf, buffer, size)) - return -EFAULT; + struct ctl_table tmp = *table; + + tmp.data = buf; + r = proc_dostring(&tmp, write, buffer, lenp, ppos); + if (r) + return r; if (!strcmp(buf, "NONE")) { nf_log_unbind_pf(net, tindex); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6877a396f..ecee105bb 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,17 +30,19 @@ #include #include -static DEFINE_SPINLOCK(nf_nat_lock); - static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; -static struct hlist_head *nf_nat_bysource __read_mostly; -static unsigned int nf_nat_htable_size __read_mostly; -static unsigned int nf_nat_hash_rnd __read_mostly; +struct nf_nat_conn_key { + const struct net *net; + const struct nf_conntrack_tuple *tuple; + const struct nf_conntrack_zone *zone; +}; + +static struct rhashtable nf_nat_bysource_table; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -119,19 +121,17 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ -/* We keep an extra hash for each conntrack, for fast searching. */ -static inline unsigned int -hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) +static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) { - unsigned int hash; - - get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + const struct nf_conntrack_tuple *t; + const struct nf_conn *ct = data; + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */ - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - return reciprocal_scale(hash, nf_nat_htable_size); + seed ^= net_hash_mix(nf_ct_net(ct)); + return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), + t->dst.protonum ^ seed); } /* Is this tuple already taken? (not by us) */ @@ -187,6 +187,26 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); } +static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct nf_nat_conn_key *key = arg->key; + const struct nf_conn *ct = obj; + + return same_src(ct, key->tuple) && + net_eq(nf_ct_net(ct), key->net) && + nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL); +} + +static struct rhashtable_params nf_nat_bysource_params = { + .head_offset = offsetof(struct nf_conn, nat_bysource), + .obj_hashfn = nf_nat_bysource_hash, + .obj_cmpfn = nf_nat_bysource_cmp, + .nelem_hint = 256, + .min_size = 1024, + .nulls_base = (1U << RHT_BASE_SHIFT), +}; + /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, @@ -197,25 +217,23 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(net, tuple); - const struct nf_conn_nat *nat; const struct nf_conn *ct; + struct nf_nat_conn_key key = { + .net = net, + .tuple = tuple, + .zone = zone + }; - hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) { - ct = nat->ct; - if (same_src(ct, tuple) && - net_eq(net, nf_ct_net(ct)) && - nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { - /* Copy source part from reply tuple. */ - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(l3proto, l4proto, result, range)) - return 1; - } - } - return 0; + ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key, + nf_nat_bysource_params); + if (!ct) + return 0; + + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + return in_range(l3proto, l4proto, result, range); } /* For [FUTURE] fragmentation handling, we want the least-used @@ -387,7 +405,6 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { - struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; @@ -424,21 +441,18 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; if (nfct_help(ct)) - nfct_seqadj_ext_add(ct); + if (!nfct_seqadj_ext_add(ct)) + return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { - unsigned int srchash; - - srchash = hash_by_src(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_lock); - /* nf_conntrack_alter_reply might re-allocate extension aera */ - nat = nfct_nat(ct); - nat->ct = ct; - hlist_add_head_rcu(&nat->bysource, - &nf_nat_bysource[srchash]); - spin_unlock_bh(&nf_nat_lock); + int err; + + err = rhashtable_insert_fast(&nf_nat_bysource_table, + &ct->nat_bysource, + nf_nat_bysource_params); + if (err) + return NF_DROP; } /* It's done. */ @@ -543,7 +557,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) if (nf_nat_proto_remove(ct, data)) return 1; - if (!nat || !nat->ct) + if (!nat) return 0; /* This netns is being destroyed, and conntrack has nat null binding. @@ -555,11 +569,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) if (!del_timer(&ct->timeout)) return 1; - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&nat->bysource); ct->status &= ~IPS_NAT_DONE_MASK; - nat->ct = NULL; - spin_unlock_bh(&nf_nat_lock); + + rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); add_timer(&ct->timeout); @@ -688,35 +701,17 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); - if (nat == NULL || nat->ct == NULL) - return; - - NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); - - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&nat->bysource); - spin_unlock_bh(&nf_nat_lock); -} - -static void nf_nat_move_storage(void *new, void *old) -{ - struct nf_conn_nat *new_nat = new; - struct nf_conn_nat *old_nat = old; - struct nf_conn *ct = old_nat->ct; - - if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) + if (!nat) return; - spin_lock_bh(&nf_nat_lock); - hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); - spin_unlock_bh(&nf_nat_lock); + rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); } static struct nf_ct_ext_type nat_extend __read_mostly = { .len = sizeof(struct nf_conn_nat), .align = __alignof__(struct nf_conn_nat), .destroy = nf_nat_cleanup_conntrack, - .move = nf_nat_move_storage, .id = NF_CT_EXT_NAT, .flags = NF_CT_EXT_F_PREALLOC, }; @@ -813,7 +808,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, if (err < 0) return err; - return nf_nat_setup_info(ct, &range, manip); + return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int @@ -845,16 +840,13 @@ static int __init nf_nat_init(void) { int ret; - /* Leave them the same for the moment. */ - nf_nat_htable_size = nf_conntrack_htable_size; - - nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); - if (!nf_nat_bysource) - return -ENOMEM; + ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); + if (ret) + return ret; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + rhashtable_destroy(&nf_nat_bysource_table); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -878,7 +870,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + rhashtable_destroy(&nf_nat_bysource_table); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -896,8 +888,8 @@ static void __exit nf_nat_cleanup(void) #endif for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - synchronize_net(); - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + + rhashtable_destroy(&nf_nat_bysource_table); } MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cf7c74599..7e1c876c7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -131,29 +131,8 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } -static int nft_register_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) -{ - struct net *net = read_pnet(&basechain->pnet); - - if (basechain->flags & NFT_BASECHAIN_DISABLED) - return 0; - - return nf_register_net_hooks(net, basechain->ops, hook_nops); -} - -static void nft_unregister_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) -{ - struct net *net = read_pnet(&basechain->pnet); - - if (basechain->flags & NFT_BASECHAIN_DISABLED) - return; - - nf_unregister_net_hooks(net, basechain->ops, hook_nops); -} - -static int nf_tables_register_hooks(const struct nft_table *table, +static int nf_tables_register_hooks(struct net *net, + const struct nft_table *table, struct nft_chain *chain, unsigned int hook_nops) { @@ -161,10 +140,12 @@ static int nf_tables_register_hooks(const struct nft_table *table, !(chain->flags & NFT_BASE_CHAIN)) return 0; - return nft_register_basechain(nft_base_chain(chain), hook_nops); + return nf_register_net_hooks(net, nft_base_chain(chain)->ops, + hook_nops); } -static void nf_tables_unregister_hooks(const struct nft_table *table, +static void nf_tables_unregister_hooks(struct net *net, + const struct nft_table *table, struct nft_chain *chain, unsigned int hook_nops) { @@ -172,12 +153,9 @@ static void nf_tables_unregister_hooks(const struct nft_table *table, !(chain->flags & NFT_BASE_CHAIN)) return; - nft_unregister_basechain(nft_base_chain(chain), hook_nops); + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, hook_nops); } -/* Internal table flags */ -#define NFT_TABLE_INACTIVE (1 << 15) - static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; @@ -187,7 +165,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) return -ENOMEM; if (msg_type == NFT_MSG_NEWTABLE) - ctx->table->flags |= NFT_TABLE_INACTIVE; + nft_activate_next(ctx->net, ctx->table); list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -201,7 +179,7 @@ static int nft_deltable(struct nft_ctx *ctx) if (err < 0) return err; - list_del_rcu(&ctx->table->list); + nft_deactivate_next(ctx->net, ctx->table); return err; } @@ -214,7 +192,7 @@ static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) return -ENOMEM; if (msg_type == NFT_MSG_NEWCHAIN) - ctx->chain->flags |= NFT_CHAIN_INACTIVE; + nft_activate_next(ctx->net, ctx->chain); list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -229,47 +207,17 @@ static int nft_delchain(struct nft_ctx *ctx) return err; ctx->table->use--; - list_del_rcu(&ctx->chain->list); + nft_deactivate_next(ctx->net, ctx->chain); return err; } -static inline bool -nft_rule_is_active(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & nft_genmask_cur(net)) == 0; -} - -static inline int -nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & nft_genmask_next(net)) == 0; -} - -static inline void -nft_rule_activate_next(struct net *net, struct nft_rule *rule) -{ - /* Now inactive, will be active in the future */ - rule->genmask = nft_genmask_cur(net); -} - -static inline void -nft_rule_deactivate_next(struct net *net, struct nft_rule *rule) -{ - rule->genmask = nft_genmask_next(net); -} - -static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) -{ - rule->genmask &= ~nft_genmask_next(net); -} - static int nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) { /* You cannot delete the same rule twice */ - if (nft_rule_is_active_next(ctx->net, rule)) { - nft_rule_deactivate_next(ctx->net, rule); + if (nft_is_active_next(ctx->net, rule)) { + nft_deactivate_next(ctx->net, rule); ctx->chain->use--; return 0; } @@ -322,9 +270,6 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) return 0; } -/* Internal set flag */ -#define NFT_SET_INACTIVE (1 << 15) - static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, struct nft_set *set) { @@ -337,7 +282,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); - set->flags |= NFT_SET_INACTIVE; + nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -353,7 +298,7 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; - list_del_rcu(&set->list); + nft_deactivate_next(ctx->net, set); ctx->table->use--; return err; @@ -364,26 +309,29 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) */ static struct nft_table *nft_table_lookup(const struct nft_af_info *afi, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_table *table; list_for_each_entry(table, &afi->tables, list) { - if (!nla_strcmp(nla, table->name)) + if (!nla_strcmp(nla, table->name) && + nft_active_genmask(table, genmask)) return table; } return NULL; } static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); - table = nft_table_lookup(afi, nla); + table = nft_table_lookup(afi, nla, genmask); if (table != NULL) return table; @@ -524,6 +472,8 @@ static int nf_tables_dump_tables(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, table)) + continue; if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -548,6 +498,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); const struct nft_af_info *afi; const struct nft_table *table; struct sk_buff *skb2; @@ -565,11 +516,9 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -588,17 +537,21 @@ err: return err; } -static int nf_tables_table_enable(const struct nft_af_info *afi, +static int nf_tables_table_enable(struct net *net, + const struct nft_af_info *afi, struct nft_table *table) { struct nft_chain *chain; int err, i = 0; list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; if (!(chain->flags & NFT_BASE_CHAIN)) continue; - err = nft_register_basechain(nft_base_chain(chain), afi->nops); + err = nf_register_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); if (err < 0) goto err; @@ -607,26 +560,34 @@ static int nf_tables_table_enable(const struct nft_af_info *afi, return 0; err: list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; if (!(chain->flags & NFT_BASE_CHAIN)) continue; if (i-- <= 0) break; - nft_unregister_basechain(nft_base_chain(chain), afi->nops); + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); } return err; } -static void nf_tables_table_disable(const struct nft_af_info *afi, +static void nf_tables_table_disable(struct net *net, + const struct nft_af_info *afi, struct nft_table *table) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { - if (chain->flags & NFT_BASE_CHAIN) - nft_unregister_basechain(nft_base_chain(chain), - afi->nops); + if (!nft_is_active_next(net, chain)) + continue; + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); } } @@ -656,7 +617,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) nft_trans_table_enable(trans) = false; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(ctx->afi, ctx->table); + ret = nf_tables_table_enable(ctx->net, ctx->afi, ctx->table); if (ret >= 0) { ctx->table->flags &= ~NFT_TABLE_F_DORMANT; nft_trans_table_enable(trans) = true; @@ -678,6 +639,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); const struct nlattr *name; struct nft_af_info *afi; struct nft_table *table; @@ -691,7 +653,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, return PTR_ERR(afi); name = nla[NFTA_TABLE_NAME]; - table = nf_tables_table_lookup(afi, name); + table = nf_tables_table_lookup(afi, name, genmask); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); @@ -699,8 +661,6 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, } if (table != NULL) { - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) @@ -752,6 +712,9 @@ static int nft_flush_table(struct nft_ctx *ctx) struct nft_set *set, *ns; list_for_each_entry(chain, &ctx->table->chains, list) { + if (!nft_is_active_next(ctx->net, chain)) + continue; + ctx->chain = chain; err = nft_delrule_by_chain(ctx); @@ -760,6 +723,9 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(set, ns, &ctx->table->sets, list) { + if (!nft_is_active_next(ctx->net, set)) + continue; + if (set->flags & NFT_SET_ANONYMOUS && !list_empty(&set->bindings)) continue; @@ -770,6 +736,9 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) { + if (!nft_is_active_next(ctx->net, chain)) + continue; + ctx->chain = chain; err = nft_delchain(ctx); @@ -795,6 +764,9 @@ static int nft_flush(struct nft_ctx *ctx, int family) ctx->afi = afi; list_for_each_entry_safe(table, nt, &afi->tables, list) { + if (!nft_is_active_next(ctx->net, table)) + continue; + if (nla[NFTA_TABLE_NAME] && nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) continue; @@ -815,6 +787,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; int family = nfmsg->nfgen_family; @@ -828,7 +801,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -875,12 +848,14 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type); */ static struct nft_chain * -nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle) +nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, + u8 genmask) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { - if (chain->handle == handle) + if (chain->handle == handle && + nft_active_genmask(chain, genmask)) return chain; } @@ -888,7 +863,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle) } static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_chain *chain; @@ -896,7 +872,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, return ERR_PTR(-EINVAL); list_for_each_entry(chain, &table->chains, list) { - if (!nla_strcmp(nla, chain->name)) + if (!nla_strcmp(nla, chain->name) && + nft_active_genmask(chain, genmask)) return chain; } @@ -1079,6 +1056,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, chain)) + continue; if (nf_tables_fill_chain_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -1104,6 +1083,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; @@ -1122,17 +1102,13 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -1231,6 +1207,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct nft_chain *chain; struct nft_base_chain *basechain = NULL; struct nlattr *ha[NFTA_HOOK_MAX + 1]; + u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; struct net_device *dev = NULL; u8 policy = NF_ACCEPT; @@ -1247,7 +1224,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1256,11 +1233,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle); + chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } else { - chain = nf_tables_chain_lookup(table, name); + chain = nf_tables_chain_lookup(table, name, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) return PTR_ERR(chain); @@ -1291,16 +1268,20 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct nft_stats *stats = NULL; struct nft_trans *trans; - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - if (nla[NFTA_CHAIN_HANDLE] && name && - !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]))) - return -EEXIST; + if (nla[NFTA_CHAIN_HANDLE] && name) { + struct nft_chain *chain2; + + chain2 = nf_tables_chain_lookup(table, + nla[NFTA_CHAIN_NAME], + genmask); + if (IS_ERR(chain2)) + return PTR_ERR(chain2); + } if (nla[NFTA_CHAIN_COUNTERS]) { if (!(chain->flags & NFT_BASE_CHAIN)) @@ -1424,7 +1405,6 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, rcu_assign_pointer(basechain->stats, stats); } - write_pnet(&basechain->pnet, net); basechain->type = type; chain = &basechain->chain; @@ -1455,7 +1435,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, chain->table = table; nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); - err = nf_tables_register_hooks(table, chain, afi->nops); + err = nf_tables_register_hooks(net, table, chain, afi->nops); if (err < 0) goto err1; @@ -1468,7 +1448,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, list_add_tail_rcu(&chain->list, &table->chains); return 0; err2: - nf_tables_unregister_hooks(table, chain, afi->nops); + nf_tables_unregister_hooks(net, table, chain, afi->nops); err1: nf_tables_chain_destroy(chain); return err; @@ -1479,6 +1459,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; @@ -1489,11 +1470,11 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (chain->use > 0) @@ -1878,10 +1859,16 @@ err: return err; } +struct nft_rule_dump_ctx { + char table[NFT_TABLE_MAXNAMELEN]; + char chain[NFT_CHAIN_MAXNAMELEN]; +}; + static int nf_tables_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_rule_dump_ctx *ctx = cb->data; const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; @@ -1898,9 +1885,17 @@ static int nf_tables_dump_rules(struct sk_buff *skb, continue; list_for_each_entry_rcu(table, &afi->tables, list) { + if (ctx && ctx->table[0] && + strcmp(ctx->table, table->name) != 0) + continue; + list_for_each_entry_rcu(chain, &table->chains, list) { + if (ctx && ctx->chain[0] && + strcmp(ctx->chain, chain->name) != 0) + continue; + list_for_each_entry_rcu(rule, &chain->rules, list) { - if (!nft_rule_is_active(net, rule)) + if (!nft_is_active(net, rule)) goto cont; if (idx < s_idx) goto cont; @@ -1928,11 +1923,18 @@ done: return skb->len; } +static int nf_tables_dump_rules_done(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + static int nf_tables_getrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; @@ -1944,7 +1946,25 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_rules, + .done = nf_tables_dump_rules_done, }; + + if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { + struct nft_rule_dump_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (nla[NFTA_RULE_TABLE]) + nla_strlcpy(ctx->table, nla[NFTA_RULE_TABLE], + sizeof(ctx->table)); + if (nla[NFTA_RULE_CHAIN]) + nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN], + sizeof(ctx->chain)); + c.data = ctx; + } + return netlink_dump_start(nlsk, skb, nlh, &c); } @@ -1952,17 +1972,13 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) @@ -2011,6 +2027,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; @@ -2031,11 +2048,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); @@ -2104,7 +2121,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (rule == NULL) goto err1; - nft_rule_activate_next(net, rule); + nft_activate_next(net, rule); rule->handle = handle; rule->dlen = size; @@ -2126,14 +2143,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } if (nlh->nlmsg_flags & NLM_F_REPLACE) { - if (nft_rule_is_active_next(net, old_rule)) { + if (nft_is_active_next(net, old_rule)) { trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, old_rule); if (trans == NULL) { err = -ENOMEM; goto err2; } - nft_rule_deactivate_next(net, old_rule); + nft_deactivate_next(net, old_rule); chain->use--; list_add_tail_rcu(&rule->list, &old_rule->list); } else { @@ -2176,6 +2193,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain = NULL; @@ -2187,12 +2205,13 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); if (nla[NFTA_RULE_CHAIN]) { - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], + genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } @@ -2212,6 +2231,9 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, } } else { list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; + ctx.chain = chain; err = nft_delrule_by_chain(&ctx); if (err < 0) @@ -2341,7 +2363,8 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi = NULL; @@ -2357,7 +2380,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, if (afi == NULL) return -EAFNOSUPPORT; - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], + genmask); if (IS_ERR(table)) return PTR_ERR(table); } @@ -2367,7 +2391,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, } struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla) + const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -2375,22 +2399,27 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-EINVAL); list_for_each_entry(set, &table->sets, list) { - if (!nla_strcmp(nla, set->name)) + if (!nla_strcmp(nla, set->name) && + nft_active_genmask(set, genmask)) return set; } return ERR_PTR(-ENOENT); } struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_set *set = nft_trans_set(trans); + if (trans->msg_type == NFT_MSG_NEWSET && - id == nft_trans_set_id(trans)) - return nft_trans_set(trans); + id == nft_trans_set_id(trans) && + nft_active_genmask(set, genmask)) + return set; } return ERR_PTR(-ENOENT); } @@ -2415,6 +2444,8 @@ cont: list_for_each_entry(i, &ctx->table->sets, list) { int tmp; + if (!nft_is_active_next(ctx->net, set)) + continue; if (!sscanf(i->name, name, &tmp)) continue; if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) @@ -2434,6 +2465,8 @@ cont: snprintf(set->name, sizeof(set->name), name, min + n); list_for_each_entry(i, &ctx->table->sets, list) { + if (!nft_is_active_next(ctx->net, i)) + continue; if (!strcmp(set->name, i->name)) return -ENFILE; } @@ -2582,6 +2615,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) list_for_each_entry_rcu(set, &table->sets, list) { if (idx < s_idx) goto cont; + if (!nft_is_active(net, set)) + goto cont; ctx_set = *ctx; ctx_set.table = table; @@ -2618,6 +2653,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_cur(net); const struct nft_set *set; struct nft_ctx ctx; struct sk_buff *skb2; @@ -2625,7 +2661,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -2652,11 +2688,9 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) @@ -2695,6 +2729,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); const struct nft_set_ops *ops; struct nft_af_info *afi; struct nft_table *table; @@ -2792,13 +2827,13 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); - set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); + set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) return PTR_ERR(set); @@ -2845,7 +2880,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } INIT_LIST_HEAD(&set->bindings); - write_pnet(&set->pnet, net); set->ops = ops; set->ktype = ktype; set->klen = desc.klen; @@ -2897,6 +2931,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_set *set; struct nft_ctx ctx; int err; @@ -2906,11 +2941,11 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings)) @@ -2975,7 +3010,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, list_del_rcu(&binding->list); if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && - !(set->flags & NFT_SET_INACTIVE)) + nft_is_active(ctx->net, set)) nf_tables_set_destroy(ctx, set); } @@ -3031,7 +3066,8 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; @@ -3041,7 +3077,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE], + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -3138,6 +3175,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + u8 genmask = nft_genmask_cur(net); const struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; @@ -3154,17 +3192,14 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) return err; err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, - (void *)nla); + (void *)nla, genmask); if (err < 0) return err; - if (ctx.table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; event = NFT_MSG_NEWSETELEM; event |= NFNL_SUBSYS_NFTABLES << 8; @@ -3218,21 +3253,19 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_cur(net); const struct nft_set *set; struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; - if (ctx.table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -3525,7 +3558,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err4; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; - err = set->ops->insert(set, &elem); + err = set->ops->insert(ctx->net, set, &elem); if (err < 0) goto err5; @@ -3550,6 +3583,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3558,15 +3592,17 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) { if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { set = nf_tables_set_lookup_byid(net, - nla[NFTA_SET_ELEM_LIST_SET_ID]); + nla[NFTA_SET_ELEM_LIST_SET_ID], + genmask); } if (IS_ERR(set)) return PTR_ERR(set); @@ -3646,7 +3682,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, goto err3; } - priv = set->ops->deactivate(set, &elem); + priv = set->ops->deactivate(ctx->net, set, &elem); if (priv == NULL) { err = -ENOENT; goto err4; @@ -3672,6 +3708,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3680,11 +3717,12 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) @@ -3952,36 +3990,40 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { if (!nft_trans_table_enable(trans)) { - nf_tables_table_disable(trans->ctx.afi, + nf_tables_table_disable(net, + trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } } else { - trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE; + nft_clear(net, trans->ctx.table); } nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: + list_del_rcu(&trans->ctx.table->list); nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) nft_chain_commit_update(trans); else - trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE; + nft_clear(net, trans->ctx.chain); nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); nft_trans_destroy(trans); break; case NFT_MSG_DELCHAIN: + list_del_rcu(&trans->ctx.chain->list); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); - nf_tables_unregister_hooks(trans->ctx.table, + nf_tables_unregister_hooks(trans->ctx.net, + trans->ctx.table, trans->ctx.chain, trans->ctx.afi->nops); break; case NFT_MSG_NEWRULE: - nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_clear(trans->ctx.net, nft_trans_rule(trans)); nf_tables_rule_notify(&trans->ctx, nft_trans_rule(trans), NFT_MSG_NEWRULE); @@ -3994,7 +4036,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NFT_MSG_DELRULE); break; case NFT_MSG_NEWSET: - nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE; + nft_clear(net, nft_trans_set(trans)); /* This avoids hitting -EBUSY when deleting the table * from the transaction. */ @@ -4007,13 +4049,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_DELSET, GFP_KERNEL); break; case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->activate(te->set, &te->elem); + te->set->ops->activate(net, te->set, &te->elem); nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_NEWSETELEM, 0); @@ -4078,7 +4121,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { if (nft_trans_table_enable(trans)) { - nf_tables_table_disable(trans->ctx.afi, + nf_tables_table_disable(net, + trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } @@ -4088,8 +4132,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELTABLE: - list_add_tail_rcu(&trans->ctx.table->list, - &trans->ctx.afi->tables); + nft_clear(trans->ctx.net, trans->ctx.table); nft_trans_destroy(trans); break; case NFT_MSG_NEWCHAIN: @@ -4100,15 +4143,15 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) } else { trans->ctx.table->use--; list_del_rcu(&trans->ctx.chain->list); - nf_tables_unregister_hooks(trans->ctx.table, + nf_tables_unregister_hooks(trans->ctx.net, + trans->ctx.table, trans->ctx.chain, trans->ctx.afi->nops); } break; case NFT_MSG_DELCHAIN: trans->ctx.table->use++; - list_add_tail_rcu(&trans->ctx.chain->list, - &trans->ctx.table->chains); + nft_clear(trans->ctx.net, trans->ctx.chain); nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: @@ -4117,7 +4160,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELRULE: trans->ctx.chain->use++; - nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: @@ -4126,8 +4169,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELSET: trans->ctx.table->use++; - list_add_tail_rcu(&nft_trans_set(trans)->list, - &trans->ctx.table->sets); + nft_clear(trans->ctx.net, nft_trans_set(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -4139,7 +4181,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->activate(te->set, &te->elem); + te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; nft_trans_destroy(trans); @@ -4274,6 +4316,8 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, } list_for_each_entry(set, &ctx->table->sets, list) { + if (!nft_is_active_next(ctx->net, set)) + continue; if (!(set->flags & NFT_SET_MAP) || set->dtype != NFT_DATA_VERDICT) continue; @@ -4432,6 +4476,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { + u8 genmask = nft_genmask_next(ctx->net); struct nlattr *tb[NFTA_VERDICT_MAX + 1]; struct nft_chain *chain; int err; @@ -4464,7 +4509,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; chain = nf_tables_chain_lookup(ctx->table, - tb[NFTA_VERDICT_CHAIN]); + tb[NFTA_VERDICT_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (chain->flags & NFT_BASE_CHAIN) @@ -4642,7 +4687,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) BUG_ON(!(ctx->chain->flags & NFT_BASE_CHAIN)); - nf_tables_unregister_hooks(ctx->chain->table, ctx->chain, + nf_tables_unregister_hooks(ctx->net, ctx->chain->table, ctx->chain, ctx->afi->nops); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); @@ -4671,7 +4716,8 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) list_for_each_entry_safe(table, nt, &afi->tables, list) { list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hooks(table, chain, afi->nops); + nf_tables_unregister_hooks(net, table, chain, + afi->nops); /* No packets are walking on these chains anymore. */ ctx.table = table; list_for_each_entry(chain, &table->chains, list) { diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 5eefe4a35..75d696f11 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -30,7 +30,6 @@ nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, if (!iph) return; - iph = ip_hdr(skb); if (iph->ihl < 5 || iph->version != 4) return; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 39eb1cc62..fa24a5b39 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -237,7 +237,7 @@ void nft_trace_notify(struct nft_traceinfo *info) break; case NFT_TRACETYPE_POLICY: if (nla_put_be32(skb, NFTA_TRACE_POLICY, - info->basechain->policy)) + htonl(info->basechain->policy))) goto nla_put_failure; break; } diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 1b4de4bd6..d44d89b56 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -326,14 +326,14 @@ static int nfnl_acct_try_del(struct nf_acct *cur) { int ret = 0; - /* we want to avoid races with nfnl_acct_find_get. */ - if (atomic_dec_and_test(&cur->refcnt)) { + /* We want to avoid races with nfnl_acct_put. So only when the current + * refcnt is 1, we decrease it to 0. + */ + if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) { /* We are protected by nfnl mutex. */ list_del_rcu(&cur->head); kfree_rcu(cur, rcu_head); } else { - /* still in use, restore reference counter. */ - atomic_inc(&cur->refcnt); ret = -EBUSY; } return ret; @@ -343,12 +343,12 @@ static int nfnl_acct_del(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { - char *acct_name; - struct nf_acct *cur; + struct nf_acct *cur, *tmp; int ret = -ENOENT; + char *acct_name; if (!tb[NFACCT_NAME]) { - list_for_each_entry(cur, &net->nfnl_acct_list, head) + list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; @@ -443,7 +443,7 @@ void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) } EXPORT_SYMBOL_GPL(nfnl_acct_update); -static void nfnl_overquota_report(struct nf_acct *nfacct) +static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct) { int ret; struct sk_buff *skb; @@ -458,11 +458,12 @@ static void nfnl_overquota_report(struct nf_acct *nfacct) kfree_skb(skb); return; } - netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, + netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC); } -int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) +int nfnl_acct_overquota(struct net *net, const struct sk_buff *skb, + struct nf_acct *nfacct) { u64 now; u64 *quota; @@ -480,7 +481,7 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) if (now >= *quota && !test_and_set_bit(NFACCT_OVERQUOTA_BIT, &nfacct->flags)) { - nfnl_overquota_report(nfacct); + nfnl_overquota_report(net, nfacct); } return ret; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 3c84f1432..139e0867e 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -98,31 +98,28 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, break; } - l4proto = nf_ct_l4proto_find_get(l3num, l4num); - - /* This protocol is not supportted, skip. */ - if (l4proto->l4proto != l4num) { - ret = -EOPNOTSUPP; - goto err_proto_put; - } - if (matching) { if (nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. */ if (matching->l3num != l3num || - matching->l4proto->l4proto != l4num) { - ret = -EINVAL; - goto err_proto_put; - } - - ret = ctnl_timeout_parse_policy(&matching->data, - l4proto, net, - cda[CTA_TIMEOUT_DATA]); - return ret; + matching->l4proto->l4proto != l4num) + return -EINVAL; + + return ctnl_timeout_parse_policy(&matching->data, + matching->l4proto, net, + cda[CTA_TIMEOUT_DATA]); } - ret = -EBUSY; + + return -EBUSY; + } + + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + /* This protocol is not supportted, skip. */ + if (l4proto->l4proto != l4num) { + ret = -EOPNOTSUPP; goto err_proto_put; } @@ -303,16 +300,33 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) { struct nf_conntrack_tuple_hash *h; const struct hlist_nulls_node *nn; - int i; + unsigned int last_hsize; + spinlock_t *lock; + int i, cpu; + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) + untimeout(h, timeout); + spin_unlock_bh(&pcpu->lock); + } local_bh_disable(); - for (i = 0; i < nf_conntrack_htable_size; i++) { - nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < nf_conntrack_htable_size) { - hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) - untimeout(h, timeout); +restart: + last_hsize = nf_conntrack_htable_size; + for (i = 0; i < last_hsize; i++) { + lock = &nf_conntrack_locks[i % CONNTRACK_LOCKS]; + nf_conntrack_lock(lock); + if (last_hsize != nf_conntrack_htable_size) { + spin_unlock(lock); + goto restart; } - spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) + untimeout(h, timeout); + spin_unlock(lock); } local_bh_enable(); } @@ -322,16 +336,16 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) { int ret = 0; - /* we want to avoid races with nf_ct_timeout_find_get. */ - if (atomic_dec_and_test(&timeout->refcnt)) { + /* We want to avoid races with ctnl_timeout_put. So only when the + * current refcnt is 1, we decrease it to 0. + */ + if (atomic_cmpxchg(&timeout->refcnt, 1, 0) == 1) { /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_l4proto_put(timeout->l4proto); ctnl_untimeout(net, timeout); kfree_rcu(timeout, rcu_head); } else { - /* still in use, restore reference counter. */ - atomic_inc(&timeout->refcnt); ret = -EBUSY; } return ret; @@ -342,12 +356,13 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, const struct nlmsghdr *nlh, const struct nlattr * const cda[]) { - struct ctnl_timeout *cur; + struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; if (!cda[CTA_TIMEOUT_NAME]) { - list_for_each_entry(cur, &net->nfct_timeout_list, head) + list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, + head) ctnl_timeout_try_del(net, cur); return 0; @@ -535,7 +550,9 @@ err: static void ctnl_timeout_put(struct ctnl_timeout *timeout) { - atomic_dec(&timeout->refcnt); + if (atomic_dec_and_test(&timeout->refcnt)) + kfree_rcu(timeout, rcu_head); + module_put(THIS_MODULE); } #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ @@ -583,7 +600,9 @@ static void __net_exit cttimeout_net_exit(struct net *net) list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { list_del_rcu(&cur->head); nf_ct_l4proto_put(cur->l4proto); - kfree_rcu(cur, rcu_head); + + if (atomic_dec_and_test(&cur->refcnt)) + kfree_rcu(cur, rcu_head); } } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 11f81c838..6577db524 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -700,10 +700,13 @@ nfulnl_log_packet(struct net *net, break; case NFULNL_COPY_PACKET: - if (inst->copy_range > skb->len) + data_len = inst->copy_range; + if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) && + (li->u.ulog.copy_len < data_len)) + data_len = li->u.ulog.copy_len; + + if (data_len > skb->len) data_len = skb->len; - else - data_len = inst->copy_range; size += nla_total_size(data_len); break; @@ -1144,6 +1147,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); MODULE_ALIAS_NF_LOGGER(AF_INET, 1); MODULE_ALIAS_NF_LOGGER(AF_INET6, 1); MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1); +MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */ module_init(nfnetlink_log_init); module_exit(nfnetlink_log_fini); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 5d36a0926..f49f45081 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1145,10 +1145,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, struct nfnl_queue_net *q = nfnl_queue_pernet(net); int err; - queue = instance_lookup(q, queue_num); - if (!queue) - queue = verdict_instance_lookup(q, queue_num, - NETLINK_CB(skb).portid); + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 6228c422c..c21e7eb8d 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -23,6 +23,20 @@ #include #include +struct nft_xt { + struct list_head head; + struct nft_expr_ops ops; + unsigned int refcnt; +}; + +static void nft_xt_put(struct nft_xt *xt) +{ + if (--xt->refcnt == 0) { + list_del(&xt->head); + kfree(xt); + } +} + static int nft_compat_chain_validate_dependency(const char *tablename, const struct nft_chain *chain) { @@ -260,6 +274,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) if (par.target->destroy != NULL) par.target->destroy(&par); + nft_xt_put(container_of(expr->ops, struct nft_xt, ops)); module_put(target->me); } @@ -442,6 +457,7 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) if (par.match->destroy != NULL) par.match->destroy(&par); + nft_xt_put(container_of(expr->ops, struct nft_xt, ops)); module_put(match->me); } @@ -612,11 +628,6 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = { static LIST_HEAD(nft_match_list); -struct nft_xt { - struct list_head head; - struct nft_expr_ops ops; -}; - static struct nft_expr_type nft_match_type; static bool nft_match_cmp(const struct xt_match *match, @@ -634,6 +645,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct xt_match *match; char *mt_name; u32 rev, family; + int err; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || @@ -652,6 +664,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, if (!try_module_get(match->me)) return ERR_PTR(-ENOENT); + nft_match->refcnt++; return &nft_match->ops; } } @@ -660,14 +673,19 @@ nft_match_select_ops(const struct nft_ctx *ctx, if (IS_ERR(match)) return ERR_PTR(-ENOENT); - if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) - return ERR_PTR(-EINVAL); + if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) { + err = -EINVAL; + goto err; + } /* This is the first time we use this match, allocate operations */ nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); - if (nft_match == NULL) - return ERR_PTR(-ENOMEM); + if (nft_match == NULL) { + err = -ENOMEM; + goto err; + } + nft_match->refcnt = 1; nft_match->ops.type = &nft_match_type; nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize)); nft_match->ops.eval = nft_match_eval; @@ -680,14 +698,9 @@ nft_match_select_ops(const struct nft_ctx *ctx, list_add(&nft_match->head, &nft_match_list); return &nft_match->ops; -} - -static void nft_match_release(void) -{ - struct nft_xt *nft_match, *tmp; - - list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head) - kfree(nft_match); +err: + module_put(match->me); + return ERR_PTR(err); } static struct nft_expr_type nft_match_type __read_mostly = { @@ -717,6 +730,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct xt_target *target; char *tg_name; u32 rev, family; + int err; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || @@ -735,6 +749,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, if (!try_module_get(target->me)) return ERR_PTR(-ENOENT); + nft_target->refcnt++; return &nft_target->ops; } } @@ -743,14 +758,19 @@ nft_target_select_ops(const struct nft_ctx *ctx, if (IS_ERR(target)) return ERR_PTR(-ENOENT); - if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) - return ERR_PTR(-EINVAL); + if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) { + err = -EINVAL; + goto err; + } /* This is the first time we use this target, allocate operations */ nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); - if (nft_target == NULL) - return ERR_PTR(-ENOMEM); + if (nft_target == NULL) { + err = -ENOMEM; + goto err; + } + nft_target->refcnt = 1; nft_target->ops.type = &nft_target_type; nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); nft_target->ops.init = nft_target_init; @@ -767,14 +787,9 @@ nft_target_select_ops(const struct nft_ctx *ctx, list_add(&nft_target->head, &nft_target_list); return &nft_target->ops; -} - -static void nft_target_release(void) -{ - struct nft_xt *nft_target, *tmp; - - list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head) - kfree(nft_target); +err: + module_put(target->me); + return ERR_PTR(err); } static struct nft_expr_type nft_target_type __read_mostly = { @@ -819,8 +834,6 @@ static void __exit nft_compat_module_exit(void) nfnetlink_subsys_unregister(&nfnl_compat_subsys); nft_unregister_expr(&nft_target_type); nft_unregister_expr(&nft_match_type); - nft_match_release(); - nft_target_release(); } MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 81fbb4507..51e180f2a 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -109,18 +109,11 @@ static void nft_ct_get_eval(const struct nft_expr *expr, #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - unsigned int size; - if (!labels) { + if (labels) + memcpy(dest, labels->bits, NF_CT_LABELS_MAX_SIZE); + else memset(dest, 0, NF_CT_LABELS_MAX_SIZE); - return; - } - - size = labels->words * sizeof(long); - memcpy(dest, labels->bits, size); - if (size < NF_CT_LABELS_MAX_SIZE) - memset(((char *) dest) + size, 0, - NF_CT_LABELS_MAX_SIZE - size); return; } #endif @@ -351,6 +344,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS) + nf_ct_set_acct(ctx->net, true); + return 0; } @@ -359,6 +355,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); + bool label_got = false; unsigned int len; int err; @@ -377,6 +374,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); if (err) return err; + label_got = true; break; #endif default: @@ -386,17 +384,28 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) - return err; + goto err1; err = nft_ct_l3proto_try_module_get(ctx->afi->family); if (err < 0) - return err; + goto err1; return 0; + +err1: + if (label_got) + nf_connlabels_put(ctx->net); + return err; +} + +static void nft_ct_get_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nft_ct_l3proto_module_put(ctx->afi->family); } -static void nft_ct_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_ct_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { struct nft_ct *priv = nft_expr_priv(expr); @@ -468,7 +477,7 @@ static const struct nft_expr_ops nft_ct_get_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_eval, .init = nft_ct_get_init, - .destroy = nft_ct_destroy, + .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, }; @@ -477,7 +486,7 @@ static const struct nft_expr_ops nft_ct_set_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_eval, .init = nft_ct_set_init, - .destroy = nft_ct_destroy, + .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, }; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 78d4914fb..0af26699b 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -103,6 +103,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_dynset *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u64 timeout; int err; @@ -112,11 +113,13 @@ static int nft_dynset_init(const struct nft_ctx *ctx, tb[NFTA_DYNSET_SREG_KEY] == NULL) return -EINVAL; - set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME]); + set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME], + genmask); if (IS_ERR(set)) { if (tb[NFTA_DYNSET_SET_ID]) set = nf_tables_set_lookup_byid(ctx->net, - tb[NFTA_DYNSET_SET_ID]); + tb[NFTA_DYNSET_SET_ID], + genmask); if (IS_ERR(set)) return PTR_ERR(set); } diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index ba7aed13e..82c264e40 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -59,6 +59,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); + u32 offset, len; if (tb[NFTA_EXTHDR_DREG] == NULL || tb[NFTA_EXTHDR_TYPE] == NULL || @@ -66,9 +67,15 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, tb[NFTA_EXTHDR_LEN] == NULL) return -EINVAL; + offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); + len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); + + if (offset > U8_MAX || len > U8_MAX) + return -ERANGE; + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); - priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); - priv->len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); + priv->offset = offset; + priv->len = len; priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); return nft_validate_register_store(ctx, priv->dreg, NULL, diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index f39c53a15..564fa7929 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -71,13 +71,13 @@ static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg, return 0; } -static bool nft_hash_lookup(const struct nft_set *set, const u32 *key, - const struct nft_set_ext **ext) +static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); const struct nft_hash_elem *he; struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_cur(read_pnet(&set->pnet)), + .genmask = nft_genmask_cur(net), .set = set, .key = key, }; @@ -125,13 +125,13 @@ err1: return false; } -static int nft_hash_insert(const struct nft_set *set, +static int nft_hash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he = elem->priv; struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_next(read_pnet(&set->pnet)), + .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, }; @@ -140,22 +140,23 @@ static int nft_hash_insert(const struct nft_set *set, nft_hash_params); } -static void nft_hash_activate(const struct nft_set *set, +static void nft_hash_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash_elem *he = elem->priv; - nft_set_elem_change_active(set, &he->ext); + nft_set_elem_change_active(net, set, &he->ext); nft_set_elem_clear_busy(&he->ext); } -static void *nft_hash_deactivate(const struct nft_set *set, +static void *nft_hash_deactivate(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_next(read_pnet(&set->pnet)), + .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, }; @@ -163,8 +164,9 @@ static void *nft_hash_deactivate(const struct nft_set *set, rcu_read_lock(); he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); if (he != NULL) { - if (!nft_set_elem_mark_busy(&he->ext)) - nft_set_elem_change_active(set, &he->ext); + if (!nft_set_elem_mark_busy(&he->ext) || + !nft_is_active(net, &he->ext)) + nft_set_elem_change_active(net, set, &he->ext); else he = NULL; } diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 319c22b4b..24a73bb26 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -52,7 +52,14 @@ static int nft_log_init(const struct nft_ctx *ctx, struct nft_log *priv = nft_expr_priv(expr); struct nf_loginfo *li = &priv->loginfo; const struct nlattr *nla; - int ret; + int err; + + li->type = NF_LOG_TYPE_LOG; + if (tb[NFTA_LOG_LEVEL] != NULL && + tb[NFTA_LOG_GROUP] != NULL) + return -EINVAL; + if (tb[NFTA_LOG_GROUP] != NULL) + li->type = NF_LOG_TYPE_ULOG; nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { @@ -64,13 +71,6 @@ static int nft_log_init(const struct nft_ctx *ctx, priv->prefix = (char *)nft_log_null_prefix; } - li->type = NF_LOG_TYPE_LOG; - if (tb[NFTA_LOG_LEVEL] != NULL && - tb[NFTA_LOG_GROUP] != NULL) - return -EINVAL; - if (tb[NFTA_LOG_GROUP] != NULL) - li->type = NF_LOG_TYPE_ULOG; - switch (li->type) { case NF_LOG_TYPE_LOG: if (tb[NFTA_LOG_LEVEL] != NULL) { @@ -79,6 +79,11 @@ static int nft_log_init(const struct nft_ctx *ctx, } else { li->u.log.level = LOGLEVEL_WARNING; } + if (li->u.log.level > LOGLEVEL_DEBUG) { + err = -EINVAL; + goto err1; + } + if (tb[NFTA_LOG_FLAGS] != NULL) { li->u.log.logflags = ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); @@ -87,6 +92,7 @@ static int nft_log_init(const struct nft_ctx *ctx, case NF_LOG_TYPE_ULOG: li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); if (tb[NFTA_LOG_SNAPLEN] != NULL) { + li->u.ulog.flags |= NF_LOG_F_COPY_LEN; li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); } @@ -97,20 +103,16 @@ static int nft_log_init(const struct nft_ctx *ctx, break; } - if (ctx->afi->family == NFPROTO_INET) { - ret = nf_logger_find_get(NFPROTO_IPV4, li->type); - if (ret < 0) - return ret; + err = nf_logger_find_get(ctx->afi->family, li->type); + if (err < 0) + goto err1; - ret = nf_logger_find_get(NFPROTO_IPV6, li->type); - if (ret < 0) { - nf_logger_put(NFPROTO_IPV4, li->type); - return ret; - } - return 0; - } + return 0; - return nf_logger_find_get(ctx->afi->family, li->type); +err1: + if (priv->prefix != nft_log_null_prefix) + kfree(priv->prefix); + return err; } static void nft_log_destroy(const struct nft_ctx *ctx, @@ -122,12 +124,7 @@ static void nft_log_destroy(const struct nft_ctx *ctx, if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); - if (ctx->afi->family == NFPROTO_INET) { - nf_logger_put(NFPROTO_IPV4, li->type); - nf_logger_put(NFPROTO_IPV6, li->type); - } else { - nf_logger_put(ctx->afi->family, li->type); - } + nf_logger_put(ctx->afi->family, li->type); } static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -153,7 +150,7 @@ static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) goto nla_put_failure; - if (li->u.ulog.copy_len) { + if (li->u.ulog.flags & NF_LOG_F_COPY_LEN) { if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, htonl(li->u.ulog.copy_len))) goto nla_put_failure; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index b3c31ef80..e164325d1 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -22,6 +22,7 @@ struct nft_lookup { struct nft_set *set; enum nft_registers sreg:8; enum nft_registers dreg:8; + bool invert; struct nft_set_binding binding; }; @@ -32,14 +33,20 @@ static void nft_lookup_eval(const struct nft_expr *expr, const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; const struct nft_set_ext *ext; + bool found; - if (set->ops->lookup(set, ®s->data[priv->sreg], &ext)) { - if (set->flags & NFT_SET_MAP) - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), set->dlen); + found = set->ops->lookup(pkt->net, set, ®s->data[priv->sreg], &ext) ^ + priv->invert; + + if (!found) { + regs->verdict.code = NFT_BREAK; return; } - regs->verdict.code = NFT_BREAK; + + if (found && set->flags & NFT_SET_MAP) + nft_data_copy(®s->data[priv->dreg], + nft_set_ext_data(ext), set->dlen); + } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { @@ -47,6 +54,7 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, + [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 }, }; static int nft_lookup_init(const struct nft_ctx *ctx, @@ -54,18 +62,21 @@ static int nft_lookup_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_lookup *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; + u32 flags; int err; if (tb[NFTA_LOOKUP_SET] == NULL || tb[NFTA_LOOKUP_SREG] == NULL) return -EINVAL; - set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]); + set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET], genmask); if (IS_ERR(set)) { if (tb[NFTA_LOOKUP_SET_ID]) { set = nf_tables_set_lookup_byid(ctx->net, - tb[NFTA_LOOKUP_SET_ID]); + tb[NFTA_LOOKUP_SET_ID], + genmask); } if (IS_ERR(set)) return PTR_ERR(set); @@ -79,7 +90,22 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (tb[NFTA_LOOKUP_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); + + if (flags & ~NFT_LOOKUP_F_INV) + return -EINVAL; + + if (flags & NFT_LOOKUP_F_INV) { + if (set->flags & NFT_SET_MAP) + return -EINVAL; + priv->invert = true; + } + } + if (tb[NFTA_LOOKUP_DREG] != NULL) { + if (priv->invert) + return -EINVAL; if (!(set->flags & NFT_SET_MAP)) return -EINVAL; @@ -112,6 +138,7 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx, static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); + u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0; if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name)) goto nla_put_failure; @@ -120,6 +147,8 @@ static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) if (priv->set->flags & NFT_SET_MAP) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) + goto nla_put_failure; return 0; nla_put_failure: diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index f4bad9dc1..8a6bc7630 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -199,13 +199,6 @@ err: } EXPORT_SYMBOL_GPL(nft_meta_get_eval); -/* don't change or set _LOOPBACK, _USER, etc. */ -static bool pkt_type_ok(u32 p) -{ - return p == PACKET_HOST || p == PACKET_BROADCAST || - p == PACKET_MULTICAST || p == PACKET_OTHERHOST; -} - void nft_meta_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -223,7 +216,7 @@ void nft_meta_set_eval(const struct nft_expr *expr, break; case NFT_META_PKTTYPE: if (skb->pkt_type != value && - pkt_type_ok(value) && pkt_type_ok(skb->pkt_type)) + skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type)) skb->pkt_type = value; break; case NFT_META_NFTRACE: @@ -298,10 +291,16 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { + struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; + if (priv->key != NFT_META_PKTTYPE) + return 0; + switch (ctx->afi->family) { case NFPROTO_BRIDGE: hooks = 1 << NF_BR_PRE_ROUTING; @@ -315,6 +314,7 @@ static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) return nft_chain_validate_hooks(ctx->chain, hooks); } +EXPORT_SYMBOL_GPL(nft_meta_set_validate); int nft_meta_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -334,15 +334,16 @@ int nft_meta_set_init(const struct nft_ctx *ctx, len = sizeof(u8); break; case NFT_META_PKTTYPE: - err = nft_meta_set_init_pkttype(ctx); - if (err) - return err; len = sizeof(u8); break; default: return -EOPNOTSUPP; } + err = nft_meta_set_validate(ctx, expr, NULL); + if (err < 0) + return err; + priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) @@ -414,6 +415,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index 7201d57b5..ffe9ae062 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -41,13 +41,13 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this, return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; } -static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, - const struct nft_set_ext **ext) +static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { const struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; + u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); const void *this; int d; @@ -70,7 +70,6 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, } else if (d > 0) parent = parent->rb_right; else { -found: if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = parent->rb_left; continue; @@ -84,22 +83,25 @@ found: } } - if (set->flags & NFT_SET_INTERVAL && interval != NULL) { - rbe = interval; - goto found; + if (set->flags & NFT_SET_INTERVAL && interval != NULL && + nft_set_elem_active(&interval->ext, genmask) && + !nft_rbtree_interval_end(interval)) { + spin_unlock_bh(&nft_rbtree_lock); + *ext = &interval->ext; + return true; } out: spin_unlock_bh(&nft_rbtree_lock); return false; } -static int __nft_rbtree_insert(const struct nft_set *set, +static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new) { struct nft_rbtree *priv = nft_set_priv(set); + u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *rbe; struct rb_node *parent, **p; - u8 genmask = nft_genmask_next(read_pnet(&set->pnet)); int d; parent = NULL; @@ -132,14 +134,14 @@ static int __nft_rbtree_insert(const struct nft_set *set, return 0; } -static int nft_rbtree_insert(const struct nft_set *set, +static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe = elem->priv; int err; spin_lock_bh(&nft_rbtree_lock); - err = __nft_rbtree_insert(set, rbe); + err = __nft_rbtree_insert(net, set, rbe); spin_unlock_bh(&nft_rbtree_lock); return err; @@ -156,21 +158,23 @@ static void nft_rbtree_remove(const struct nft_set *set, spin_unlock_bh(&nft_rbtree_lock); } -static void nft_rbtree_activate(const struct nft_set *set, +static void nft_rbtree_activate(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe = elem->priv; - nft_set_elem_change_active(set, &rbe->ext); + nft_set_elem_change_active(net, set, &rbe->ext); } -static void *nft_rbtree_deactivate(const struct nft_set *set, +static void *nft_rbtree_deactivate(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; struct nft_rbtree_elem *rbe, *this = elem->priv; - u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); + u8 genmask = nft_genmask_next(net); int d; while (parent != NULL) { @@ -196,7 +200,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, parent = parent->rb_right; continue; } - nft_set_elem_change_active(set, &rbe->ext); + nft_set_elem_change_active(net, set, &rbe->ext); return rbe; } } diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 0522fc9bf..c64de3f73 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -26,11 +26,27 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { }; EXPORT_SYMBOL_GPL(nft_reject_policy); +int nft_reject_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT)); +} +EXPORT_SYMBOL_GPL(nft_reject_validate); + int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); + int err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 759ca5248..e79d9ca2f 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -66,7 +66,11 @@ static int nft_reject_inet_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; + int icmp_code, err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; @@ -124,6 +128,7 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .eval = nft_reject_inet_eval, .init = nft_reject_inet_init, .dump = nft_reject_inet_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2675d580c..e0aa7c1d0 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -702,6 +702,56 @@ int xt_check_entry_offsets(const void *base, } EXPORT_SYMBOL(xt_check_entry_offsets); +/** + * xt_alloc_entry_offsets - allocate array to store rule head offsets + * + * @size: number of entries + * + * Return: NULL or kmalloc'd or vmalloc'd array + */ +unsigned int *xt_alloc_entry_offsets(unsigned int size) +{ + unsigned int *off; + + off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN); + + if (off) + return off; + + if (size < (SIZE_MAX / sizeof(unsigned int))) + off = vmalloc(size * sizeof(unsigned int)); + + return off; +} +EXPORT_SYMBOL(xt_alloc_entry_offsets); + +/** + * xt_find_jump_offset - check if target is a valid jump offset + * + * @offsets: array containing all valid rule start offsets of a rule blob + * @target: the jump target to search for + * @size: entries in @offset + */ +bool xt_find_jump_offset(const unsigned int *offsets, + unsigned int target, unsigned int size) +{ + int m, low = 0, hi = size; + + while (hi > low) { + m = (low + hi) / 2u; + + if (offsets[m] > target) + hi = m; + else if (offsets[m] < target) + low = m + 1; + else + return true; + } + + return false; +} +EXPORT_SYMBOL(xt_find_jump_offset); + int xt_check_target(struct xt_tgchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { @@ -1460,6 +1510,9 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) uint8_t hooknum; struct nf_hook_ops *ops; + if (!num_hooks) + return ERR_PTR(-EINVAL); + ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index a1fa2c800..018eed7e1 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -33,6 +33,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; + if (info->flags & XT_NFLOG_F_COPY_LEN) + li.u.ulog.flags |= NF_LOG_F_COPY_LEN; + nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in, par->out, &li, info->prefix); return XT_CONTINUE; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 604df6fae..515131f9e 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -137,7 +137,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rstats, - &est->lock, &cfg.opt); + &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 7f4414d26..663c4c3c9 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, daddr, dport, in->ifindex); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; /* NOTE: we return listeners even if bound to * 0.0.0.0, those are filtered out in * xt_socket, since xt_TPROXY needs 0 bound @@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, daddr, ntohs(dport), in->ifindex); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; /* NOTE: we return listeners even if bound to * 0.0.0.0, those are filtered out in * xt_socket, since xt_TPROXY needs 0 bound diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index df48967af..858d189a1 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -4,12 +4,23 @@ #include #include +#include MODULE_DESCRIPTION("Xtables: packet flow tracing"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_TRACE"); MODULE_ALIAS("ip6t_TRACE"); +static int trace_tg_check(const struct xt_tgchk_param *par) +{ + return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); +} + +static void trace_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_logger_put(par->family, NF_LOG_TYPE_LOG); +} + static unsigned int trace_tg(struct sk_buff *skb, const struct xt_action_param *par) { @@ -18,12 +29,14 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par) } static struct xt_target trace_tg_reg __read_mostly = { - .name = "TRACE", - .revision = 0, - .family = NFPROTO_UNSPEC, - .table = "raw", - .target = trace_tg, - .me = THIS_MODULE, + .name = "TRACE", + .revision = 0, + .family = NFPROTO_UNSPEC, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + .me = THIS_MODULE, }; static int __init trace_tg_init(void) diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index a79af2555..03d66f1c5 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -18,21 +19,12 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels"); MODULE_ALIAS("ipt_connlabel"); MODULE_ALIAS("ip6t_connlabel"); -static bool connlabel_match(const struct nf_conn *ct, u16 bit) -{ - struct nf_conn_labels *labels = nf_ct_labels_find(ct); - - if (!labels) - return false; - - return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits); -} - static bool connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connlabel_mtinfo *info = par->matchinfo; enum ip_conntrack_info ctinfo; + struct nf_conn_labels *labels; struct nf_conn *ct; bool invert = info->options & XT_CONNLABEL_OP_INVERT; @@ -40,10 +32,21 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL || nf_ct_is_untracked(ct)) return invert; - if (info->options & XT_CONNLABEL_OP_SET) - return (nf_connlabel_set(ct, info->bit) == 0) ^ invert; + labels = nf_ct_labels_find(ct); + if (!labels) + return invert; + + if (test_bit(info->bit, labels->bits)) + return !invert; + + if (info->options & XT_CONNLABEL_OP_SET) { + if (!test_and_set_bit(info->bit, labels->bits)) + nf_conntrack_event_cache(IPCT_LABEL, ct); + + return !invert; + } - return connlabel_match(ct, info->bit) ^ invert; + return invert; } static int connlabel_mt_check(const struct xt_mtchk_param *par) diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 3048a7e3a..cf3275938 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -26,7 +26,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) nfnl_acct_update(skb, info->nfacct); - overquota = nfnl_acct_overquota(skb, info->nfacct); + overquota = nfnl_acct_overquota(par->net, skb, info->nfacct); return overquota == NFACCT_UNDERQUOTA ? false : true; } diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 1302b475a..a20e731b5 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -21,11 +21,39 @@ static int owner_check(const struct xt_mtchk_param *par) { struct xt_owner_match_info *info = par->matchinfo; + struct net *net = par->net; - /* For now only allow adding matches from the initial user namespace */ + /* Only allow the common case where the userns of the writer + * matches the userns of the network namespace. + */ if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) && - (current_user_ns() != &init_user_ns)) + (current_user_ns() != net->user_ns)) return -EINVAL; + + /* Ensure the uids are valid */ + if (info->match & XT_OWNER_UID) { + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); + + if (!uid_valid(uid_min) || !uid_valid(uid_max) || + (info->uid_max < info->uid_min) || + uid_lt(uid_max, uid_min)) { + return -EINVAL; + } + } + + /* Ensure the gids are valid */ + if (info->match & XT_OWNER_GID) { + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); + + if (!gid_valid(gid_min) || !gid_valid(gid_max) || + (info->gid_max < info->gid_min) || + gid_lt(gid_max, gid_min)) { + return -EINVAL; + } + } + return 0; } @@ -35,6 +63,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_owner_match_info *info = par->matchinfo; const struct file *filp; struct sock *sk = skb_to_full_sk(skb); + struct net *net = par->net; if (sk == NULL || sk->sk_socket == NULL) return (info->match ^ info->invert) == 0; @@ -51,8 +80,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) (XT_OWNER_UID | XT_OWNER_GID)) == 0; if (info->match & XT_OWNER_UID) { - kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); - kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); if ((uid_gte(filp->f_cred->fsuid, uid_min) && uid_lte(filp->f_cred->fsuid, uid_max)) ^ !(info->invert & XT_OWNER_UID)) @@ -60,8 +89,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } if (info->match & XT_OWNER_GID) { - kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); - kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_OWNER_GID)) diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 1caaccbc3..e5f18988a 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -102,14 +102,14 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) return -EINVAL; - if (info->bitmask & XT_PHYSDEV_OP_OUT && + if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { - pr_info("using --physdev-out in the OUTPUT, FORWARD and " - "POSTROUTING chains for non-bridged traffic is not " - "supported anymore.\n"); + pr_info("using --physdev-out and --physdev-is-out are only" + "supported in the FORWARD and POSTROUTING chains with" + "bridged traffic.\n"); if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) return -EINVAL; } diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index c14d4645d..ade024c90 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -83,8 +83,6 @@ static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; } -#define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg))) - th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) { /* We've been asked to examine this packet, and we @@ -102,9 +100,8 @@ static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) ntohs(th->dest), !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) return false; - if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) - == tcpinfo->flg_cmp, - XT_TCP_INV_FLAGS)) + if (!NF_INVF(tcpinfo, XT_TCP_INV_FLAGS, + (((unsigned char *)th)[13] & tcpinfo->flg_mask) == tcpinfo->flg_cmp)) return false; if (tcpinfo->option) { if (th->doff * 4 < sizeof(_tcph)) { diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig index 56958c85f..d9eaa30ff 100644 --- a/net/netlabel/Kconfig +++ b/net/netlabel/Kconfig @@ -5,6 +5,7 @@ config NETLABEL bool "NetLabel subsystem support" depends on SECURITY + select CRC_CCITT if IPV6 default n ---help--- NetLabel provides support for explicit network packet labeling diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile index d2732fc95..d341ede0d 100644 --- a/net/netlabel/Makefile +++ b/net/netlabel/Makefile @@ -12,4 +12,4 @@ obj-y += netlabel_mgmt.o # protocol modules obj-y += netlabel_unlabeled.o obj-y += netlabel_cipso_v4.o - +obj-$(subst m,y,$(CONFIG_IPV6)) += netlabel_calipso.o diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c new file mode 100644 index 000000000..2ec93c5e7 --- /dev/null +++ b/net/netlabel/netlabel_calipso.c @@ -0,0 +1,740 @@ +/* + * NetLabel CALIPSO/IPv6 Support + * + * This file defines the CALIPSO/IPv6 functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and CALIPSO. + * + * Authors: Paul Moore + * Huw Davies + * + */ + +/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Huw Davies , 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlabel_user.h" +#include "netlabel_calipso.h" +#include "netlabel_mgmt.h" +#include "netlabel_domainhash.h" + +/* Argument struct for calipso_doi_walk() */ +struct netlbl_calipso_doiwalk_arg { + struct netlink_callback *nl_cb; + struct sk_buff *skb; + u32 seq; +}; + +/* Argument struct for netlbl_domhsh_walk() */ +struct netlbl_domhsh_walk_arg { + struct netlbl_audit *audit_info; + u32 doi; +}; + +/* NetLabel Generic NETLINK CALIPSO family */ +static struct genl_family netlbl_calipso_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = NETLBL_NLTYPE_CALIPSO_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_CALIPSO_A_MAX, +}; + +/* NetLabel Netlink attribute policy */ +static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = { + [NLBL_CALIPSO_A_DOI] = { .type = NLA_U32 }, + [NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 }, +}; + +/* NetLabel Command Handlers + */ +/** + * netlbl_calipso_add_pass - Adds a CALIPSO pass DOI definition + * @info: the Generic NETLINK info block + * @audit_info: NetLabel audit information + * + * Description: + * Create a new CALIPSO_MAP_PASS DOI definition based on the given ADD message + * and add it to the CALIPSO engine. Return zero on success and non-zero on + * error. + * + */ +static int netlbl_calipso_add_pass(struct genl_info *info, + struct netlbl_audit *audit_info) +{ + int ret_val; + struct calipso_doi *doi_def = NULL; + + doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + if (!doi_def) + return -ENOMEM; + doi_def->type = CALIPSO_MAP_PASS; + doi_def->doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); + ret_val = calipso_doi_add(doi_def, audit_info); + if (ret_val != 0) + calipso_doi_free(doi_def); + + return ret_val; +} + +/** + * netlbl_calipso_add - Handle an ADD message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Create a new DOI definition based on the given ADD message and add it to the + * CALIPSO engine. Returns zero on success, negative values on failure. + * + */ +static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) + +{ + int ret_val = -EINVAL; + struct netlbl_audit audit_info; + + if (!info->attrs[NLBL_CALIPSO_A_DOI] || + !info->attrs[NLBL_CALIPSO_A_MTYPE]) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { + case CALIPSO_MAP_PASS: + ret_val = netlbl_calipso_add_pass(info, &audit_info); + break; + } + if (ret_val == 0) + atomic_inc(&netlabel_mgmt_protocount); + + return ret_val; +} + +/** + * netlbl_calipso_list - Handle a LIST message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated LIST message and respond accordingly. + * Returns zero on success and negative values on error. + * + */ +static int netlbl_calipso_list(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val; + struct sk_buff *ans_skb = NULL; + void *data; + u32 doi; + struct calipso_doi *doi_def; + + if (!info->attrs[NLBL_CALIPSO_A_DOI]) { + ret_val = -EINVAL; + goto list_failure; + } + + doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); + + doi_def = calipso_doi_getdef(doi); + if (!doi_def) { + ret_val = -EINVAL; + goto list_failure; + } + + ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ans_skb) { + ret_val = -ENOMEM; + goto list_failure_put; + } + data = genlmsg_put_reply(ans_skb, info, &netlbl_calipso_gnl_family, + 0, NLBL_CALIPSO_C_LIST); + if (!data) { + ret_val = -ENOMEM; + goto list_failure_put; + } + + ret_val = nla_put_u32(ans_skb, NLBL_CALIPSO_A_MTYPE, doi_def->type); + if (ret_val != 0) + goto list_failure_put; + + calipso_doi_putdef(doi_def); + + genlmsg_end(ans_skb, data); + return genlmsg_reply(ans_skb, info); + +list_failure_put: + calipso_doi_putdef(doi_def); +list_failure: + kfree_skb(ans_skb); + return ret_val; +} + +/** + * netlbl_calipso_listall_cb - calipso_doi_walk() callback for LISTALL + * @doi_def: the CALIPSO DOI definition + * @arg: the netlbl_calipso_doiwalk_arg structure + * + * Description: + * This function is designed to be used as a callback to the + * calipso_doi_walk() function for use in generating a response for a LISTALL + * message. Returns the size of the message on success, negative values on + * failure. + * + */ +static int netlbl_calipso_listall_cb(struct calipso_doi *doi_def, void *arg) +{ + int ret_val = -ENOMEM; + struct netlbl_calipso_doiwalk_arg *cb_arg = arg; + void *data; + + data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, + cb_arg->seq, &netlbl_calipso_gnl_family, + NLM_F_MULTI, NLBL_CALIPSO_C_LISTALL); + if (!data) + goto listall_cb_failure; + + ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_DOI, doi_def->doi); + if (ret_val != 0) + goto listall_cb_failure; + ret_val = nla_put_u32(cb_arg->skb, + NLBL_CALIPSO_A_MTYPE, + doi_def->type); + if (ret_val != 0) + goto listall_cb_failure; + + genlmsg_end(cb_arg->skb, data); + return 0; + +listall_cb_failure: + genlmsg_cancel(cb_arg->skb, data); + return ret_val; +} + +/** + * netlbl_calipso_listall - Handle a LISTALL message + * @skb: the NETLINK buffer + * @cb: the NETLINK callback + * + * Description: + * Process a user generated LISTALL message and respond accordingly. Returns + * zero on success and negative values on error. + * + */ +static int netlbl_calipso_listall(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netlbl_calipso_doiwalk_arg cb_arg; + u32 doi_skip = cb->args[0]; + + cb_arg.nl_cb = cb; + cb_arg.skb = skb; + cb_arg.seq = cb->nlh->nlmsg_seq; + + calipso_doi_walk(&doi_skip, netlbl_calipso_listall_cb, &cb_arg); + + cb->args[0] = doi_skip; + return skb->len; +} + +/** + * netlbl_calipso_remove_cb - netlbl_calipso_remove() callback for REMOVE + * @entry: LSM domain mapping entry + * @arg: the netlbl_domhsh_walk_arg structure + * + * Description: + * This function is intended for use by netlbl_calipso_remove() as the callback + * for the netlbl_domhsh_walk() function; it removes LSM domain map entries + * which are associated with the CALIPSO DOI specified in @arg. Returns zero on + * success, negative values on failure. + * + */ +static int netlbl_calipso_remove_cb(struct netlbl_dom_map *entry, void *arg) +{ + struct netlbl_domhsh_walk_arg *cb_arg = arg; + + if (entry->def.type == NETLBL_NLTYPE_CALIPSO && + entry->def.calipso->doi == cb_arg->doi) + return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info); + + return 0; +} + +/** + * netlbl_calipso_remove - Handle a REMOVE message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated REMOVE message and respond accordingly. Returns + * zero on success, negative values on failure. + * + */ +static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val = -EINVAL; + struct netlbl_domhsh_walk_arg cb_arg; + struct netlbl_audit audit_info; + u32 skip_bkt = 0; + u32 skip_chain = 0; + + if (!info->attrs[NLBL_CALIPSO_A_DOI]) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); + cb_arg.audit_info = &audit_info; + ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, + netlbl_calipso_remove_cb, &cb_arg); + if (ret_val == 0 || ret_val == -ENOENT) { + ret_val = calipso_doi_remove(cb_arg.doi, &audit_info); + if (ret_val == 0) + atomic_dec(&netlabel_mgmt_protocount); + } + + return ret_val; +} + +/* NetLabel Generic NETLINK Command Definitions + */ + +static const struct genl_ops netlbl_calipso_ops[] = { + { + .cmd = NLBL_CALIPSO_C_ADD, + .flags = GENL_ADMIN_PERM, + .policy = calipso_genl_policy, + .doit = netlbl_calipso_add, + .dumpit = NULL, + }, + { + .cmd = NLBL_CALIPSO_C_REMOVE, + .flags = GENL_ADMIN_PERM, + .policy = calipso_genl_policy, + .doit = netlbl_calipso_remove, + .dumpit = NULL, + }, + { + .cmd = NLBL_CALIPSO_C_LIST, + .flags = 0, + .policy = calipso_genl_policy, + .doit = netlbl_calipso_list, + .dumpit = NULL, + }, + { + .cmd = NLBL_CALIPSO_C_LISTALL, + .flags = 0, + .policy = calipso_genl_policy, + .doit = NULL, + .dumpit = netlbl_calipso_listall, + }, +}; + +/* NetLabel Generic NETLINK Protocol Functions + */ + +/** + * netlbl_calipso_genl_init - Register the CALIPSO NetLabel component + * + * Description: + * Register the CALIPSO packet NetLabel component with the Generic NETLINK + * mechanism. Returns zero on success, negative values on failure. + * + */ +int __init netlbl_calipso_genl_init(void) +{ + return genl_register_family_with_ops(&netlbl_calipso_gnl_family, + netlbl_calipso_ops); +} + +static const struct netlbl_calipso_ops *calipso_ops; + +/** + * netlbl_calipso_ops_register - Register the CALIPSO operations + * + * Description: + * Register the CALIPSO packet engine operations. + * + */ +const struct netlbl_calipso_ops * +netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops) +{ + return xchg(&calipso_ops, ops); +} +EXPORT_SYMBOL(netlbl_calipso_ops_register); + +static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) +{ + return ACCESS_ONCE(calipso_ops); +} + +/** + * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine + * @doi_def: the DOI structure + * @audit_info: NetLabel audit information + * + * Description: + * The caller defines a new DOI for use by the CALIPSO engine and calls this + * function to add it to the list of acceptable domains. The caller must + * ensure that the mapping table specified in @doi_def->map meets all of the + * requirements of the mapping type (see calipso.h for details). Returns + * zero on success and non-zero on failure. + * + */ +int calipso_doi_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_add(doi_def, audit_info); + return ret_val; +} + +/** + * calipso_doi_free - Frees a DOI definition + * @doi_def: the DOI definition + * + * Description: + * This function frees all of the memory associated with a DOI definition. + * + */ +void calipso_doi_free(struct calipso_doi *doi_def) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->doi_free(doi_def); +} + +/** + * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine + * @doi: the DOI value + * @audit_secid: the LSM secid to use in the audit message + * + * Description: + * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will + * be called to release their own LSM domain mappings as well as our own + * domain list. Returns zero on success and negative values on failure. + * + */ +int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_remove(doi, audit_info); + return ret_val; +} + +/** + * calipso_doi_getdef - Returns a reference to a valid DOI definition + * @doi: the DOI value + * + * Description: + * Searches for a valid DOI definition and if one is found it is returned to + * the caller. Otherwise NULL is returned. The caller must ensure that + * calipso_doi_putdef() is called when the caller is done. + * + */ +struct calipso_doi *calipso_doi_getdef(u32 doi) +{ + struct calipso_doi *ret_val = NULL; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_getdef(doi); + return ret_val; +} + +/** + * calipso_doi_putdef - Releases a reference for the given DOI definition + * @doi_def: the DOI definition + * + * Description: + * Releases a DOI definition reference obtained from calipso_doi_getdef(). + * + */ +void calipso_doi_putdef(struct calipso_doi *doi_def) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->doi_putdef(doi_def); +} + +/** + * calipso_doi_walk - Iterate through the DOI definitions + * @skip_cnt: skip past this number of DOI definitions, updated + * @callback: callback for each DOI definition + * @cb_arg: argument for the callback function + * + * Description: + * Iterate over the DOI definition list, skipping the first @skip_cnt entries. + * For each entry call @callback, if @callback returns a negative value stop + * 'walking' through the list and return. Updates the value in @skip_cnt upon + * return. Returns zero on success, negative values on failure. + * + */ +int calipso_doi_walk(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, void *arg), + void *cb_arg) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_walk(skip_cnt, callback, cb_arg); + return ret_val; +} + +/** + * calipso_sock_getattr - Get the security attributes from a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Query @sk to see if there is a CALIPSO option attached to the sock and if + * there is return the CALIPSO security attributes in @secattr. This function + * requires that @sk be locked, or privately held, but it does not do any + * locking itself. Returns zero on success and negative values on failure. + * + */ +int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->sock_getattr(sk, secattr); + return ret_val; +} + +/** + * calipso_sock_setattr - Add a CALIPSO option to a socket + * @sk: the socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. This function requires + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. + * + */ +int calipso_sock_setattr(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->sock_setattr(sk, doi_def, secattr); + return ret_val; +} + +/** + * calipso_sock_delattr - Delete the CALIPSO option from a socket + * @sk: the socket + * + * Description: + * Removes the CALIPSO option from a socket, if present. + * + */ +void calipso_sock_delattr(struct sock *sk) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->sock_delattr(sk); +} + +/** + * calipso_req_setattr - Add a CALIPSO option to a connection request socket + * @req: the connection request socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. Returns zero on success and + * negative values on failure. + * + */ +int calipso_req_setattr(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->req_setattr(req, doi_def, secattr); + return ret_val; +} + +/** + * calipso_req_delattr - Delete the CALIPSO option from a request socket + * @reg: the request socket + * + * Description: + * Removes the CALIPSO option from a request socket, if present. + * + */ +void calipso_req_delattr(struct request_sock *req) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->req_delattr(req); +} + +/** + * calipso_optptr - Find the CALIPSO option in the packet + * @skb: the packet + * + * Description: + * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer + * to the start of the CALIPSO option on success, NULL if one if not found. + * + */ +unsigned char *calipso_optptr(const struct sk_buff *skb) +{ + unsigned char *ret_val = NULL; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->skbuff_optptr(skb); + return ret_val; +} + +/** + * calipso_getattr - Get the security attributes from a memory block. + * @calipso: the CALIPSO option + * @secattr: the security attributes + * + * Description: + * Inspect @calipso and return the security attributes in @secattr. + * Returns zero on success and negative values on failure. + * + */ +int calipso_getattr(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->opt_getattr(calipso, secattr); + return ret_val; +} + +/** + * calipso_skbuff_setattr - Set the CALIPSO option on a packet + * @skb: the packet + * @doi_def: the CALIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Set the CALIPSO option on the given packet based on the security attributes. + * Returns a pointer to the IP header on success and NULL on failure. + * + */ +int calipso_skbuff_setattr(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->skbuff_setattr(skb, doi_def, secattr); + return ret_val; +} + +/** + * calipso_skbuff_delattr - Delete any CALIPSO options from a packet + * @skb: the packet + * + * Description: + * Removes any and all CALIPSO options from the given packet. Returns zero on + * success, negative values on failure. + * + */ +int calipso_skbuff_delattr(struct sk_buff *skb) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->skbuff_delattr(skb); + return ret_val; +} + +/** + * calipso_cache_invalidate - Invalidates the current CALIPSO cache + * + * Description: + * Invalidates and frees any entries in the CALIPSO cache. Returns zero on + * success and negative values on failure. + * + */ +void calipso_cache_invalidate(void) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->cache_invalidate(); +} + +/** + * calipso_cache_add - Add an entry to the CALIPSO cache + * @calipso_ptr: the CALIPSO option + * @secattr: the packet's security attributes + * + * Description: + * Add a new entry into the CALIPSO label mapping cache. + * Returns zero on success, negative values on failure. + * + */ +int calipso_cache_add(const unsigned char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr) + +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->cache_add(calipso_ptr, secattr); + return ret_val; +} diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h new file mode 100644 index 000000000..9fd291cd0 --- /dev/null +++ b/net/netlabel/netlabel_calipso.h @@ -0,0 +1,151 @@ +/* + * NetLabel CALIPSO Support + * + * This file defines the CALIPSO functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and RIPSO. + * + * Authors: Paul Moore + * Huw Davies + * + */ + +/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Huw Davies , 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +#ifndef _NETLABEL_CALIPSO +#define _NETLABEL_CALIPSO + +#include +#include + +/* The following NetLabel payloads are supported by the CALIPSO subsystem. + * + * o ADD: + * Sent by an application to add a new DOI mapping table. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * NLBL_CALIPSO_A_MTYPE + * + * If using CALIPSO_MAP_PASS no additional attributes are required. + * + * o REMOVE: + * Sent by an application to remove a specific DOI mapping table from the + * CALIPSO system. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * + * o LIST: + * Sent by an application to list the details of a DOI definition. On + * success the kernel should send a response using the following format. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * + * The valid response message format depends on the type of the DOI mapping, + * the defined formats are shown below. + * + * Required attributes: + * + * NLBL_CALIPSO_A_MTYPE + * + * If using CALIPSO_MAP_PASS no additional attributes are required. + * + * o LISTALL: + * This message is sent by an application to list the valid DOIs on the + * system. When sent by an application there is no payload and the + * NLM_F_DUMP flag should be set. The kernel should respond with a series of + * the following messages. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * NLBL_CALIPSO_A_MTYPE + * + */ + +/* NetLabel CALIPSO commands */ +enum { + NLBL_CALIPSO_C_UNSPEC, + NLBL_CALIPSO_C_ADD, + NLBL_CALIPSO_C_REMOVE, + NLBL_CALIPSO_C_LIST, + NLBL_CALIPSO_C_LISTALL, + __NLBL_CALIPSO_C_MAX, +}; + +/* NetLabel CALIPSO attributes */ +enum { + NLBL_CALIPSO_A_UNSPEC, + NLBL_CALIPSO_A_DOI, + /* (NLA_U32) + * the DOI value */ + NLBL_CALIPSO_A_MTYPE, + /* (NLA_U32) + * the mapping table type (defined in the calipso.h header as + * CALIPSO_MAP_*) */ + __NLBL_CALIPSO_A_MAX, +}; + +#define NLBL_CALIPSO_A_MAX (__NLBL_CALIPSO_A_MAX - 1) + +/* NetLabel protocol functions */ +#if IS_ENABLED(CONFIG_IPV6) +int netlbl_calipso_genl_init(void); +#else +static inline int netlbl_calipso_genl_init(void) +{ + return 0; +} +#endif + +int calipso_doi_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info); +void calipso_doi_free(struct calipso_doi *doi_def); +int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info); +struct calipso_doi *calipso_doi_getdef(u32 doi); +void calipso_doi_putdef(struct calipso_doi *doi_def); +int calipso_doi_walk(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, void *arg), + void *cb_arg); +int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); +int calipso_sock_setattr(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); +void calipso_sock_delattr(struct sock *sk); +int calipso_req_setattr(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); +void calipso_req_delattr(struct request_sock *req); +unsigned char *calipso_optptr(const struct sk_buff *skb); +int calipso_getattr(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr); +int calipso_skbuff_setattr(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); +int calipso_skbuff_delattr(struct sk_buff *skb); +void calipso_cache_invalidate(void); +int calipso_cache_add(const unsigned char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr); + +#endif diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index ada674222..41d0e95d1 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -37,10 +37,12 @@ #include #include #include +#include #include #include "netlabel_mgmt.h" #include "netlabel_addrlist.h" +#include "netlabel_calipso.h" #include "netlabel_domainhash.h" #include "netlabel_user.h" @@ -55,8 +57,9 @@ struct netlbl_domhsh_tbl { static DEFINE_SPINLOCK(netlbl_domhsh_lock); #define netlbl_domhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock)) -static struct netlbl_domhsh_tbl *netlbl_domhsh; -static struct netlbl_dom_map *netlbl_domhsh_def; +static struct netlbl_domhsh_tbl __rcu *netlbl_domhsh; +static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv4; +static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv6; /* * Domain Hash Table Helper Functions @@ -126,18 +129,26 @@ static u32 netlbl_domhsh_hash(const char *key) return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1); } +static bool netlbl_family_match(u16 f1, u16 f2) +{ + return (f1 == f2) || (f1 == AF_UNSPEC) || (f2 == AF_UNSPEC); +} + /** * netlbl_domhsh_search - Search for a domain entry * @domain: the domain + * @family: the address family * * Description: * Searches the domain hash table and returns a pointer to the hash table - * entry if found, otherwise NULL is returned. The caller is responsible for + * entry if found, otherwise NULL is returned. @family may be %AF_UNSPEC + * which matches any address family entries. The caller is responsible for * ensuring that the hash table is protected with either a RCU read lock or the * hash table lock. * */ -static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) +static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, + u16 family) { u32 bkt; struct list_head *bkt_list; @@ -147,7 +158,9 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) bkt = netlbl_domhsh_hash(domain); bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; list_for_each_entry_rcu(iter, bkt_list, list) - if (iter->valid && strcmp(iter->domain, domain) == 0) + if (iter->valid && + netlbl_family_match(iter->family, family) && + strcmp(iter->domain, domain) == 0) return iter; } @@ -157,28 +170,37 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) /** * netlbl_domhsh_search_def - Search for a domain entry * @domain: the domain - * @def: return default if no match is found + * @family: the address family * * Description: * Searches the domain hash table and returns a pointer to the hash table * entry if an exact match is found, if an exact match is not present in the * hash table then the default entry is returned if valid otherwise NULL is - * returned. The caller is responsible ensuring that the hash table is + * returned. @family may be %AF_UNSPEC which matches any address family + * entries. The caller is responsible ensuring that the hash table is * protected with either a RCU read lock or the hash table lock. * */ -static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) +static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain, + u16 family) { struct netlbl_dom_map *entry; - entry = netlbl_domhsh_search(domain); - if (entry == NULL) { - entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def); - if (entry != NULL && !entry->valid) - entry = NULL; + entry = netlbl_domhsh_search(domain, family); + if (entry != NULL) + return entry; + if (family == AF_INET || family == AF_UNSPEC) { + entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv4); + if (entry != NULL && entry->valid) + return entry; + } + if (family == AF_INET6 || family == AF_UNSPEC) { + entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv6); + if (entry != NULL && entry->valid) + return entry; } - return entry; + return NULL; } /** @@ -203,6 +225,7 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, { struct audit_buffer *audit_buf; struct cipso_v4_doi *cipsov4 = NULL; + struct calipso_doi *calipso = NULL; u32 type; audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); @@ -221,12 +244,14 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, struct netlbl_domaddr6_map *map6; map6 = netlbl_domhsh_addr6_entry(addr6); type = map6->def.type; + calipso = map6->def.calipso; netlbl_af6list_audit_addr(audit_buf, 0, NULL, &addr6->addr, &addr6->mask); #endif /* IPv6 */ } else { type = entry->def.type; cipsov4 = entry->def.cipso; + calipso = entry->def.calipso; } switch (type) { case NETLBL_NLTYPE_UNLABELED: @@ -238,6 +263,12 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, " nlbl_protocol=cipsov4 cipso_doi=%u", cipsov4->doi); break; + case NETLBL_NLTYPE_CALIPSO: + BUG_ON(calipso == NULL); + audit_log_format(audit_buf, + " nlbl_protocol=calipso calipso_doi=%u", + calipso->doi); + break; } audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); audit_log_end(audit_buf); @@ -264,13 +295,25 @@ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry) if (entry == NULL) return -EINVAL; + if (entry->family != AF_INET && entry->family != AF_INET6 && + (entry->family != AF_UNSPEC || + entry->def.type != NETLBL_NLTYPE_UNLABELED)) + return -EINVAL; + switch (entry->def.type) { case NETLBL_NLTYPE_UNLABELED: - if (entry->def.cipso != NULL || entry->def.addrsel != NULL) + if (entry->def.cipso != NULL || entry->def.calipso != NULL || + entry->def.addrsel != NULL) return -EINVAL; break; case NETLBL_NLTYPE_CIPSOV4: - if (entry->def.cipso == NULL) + if (entry->family != AF_INET || + entry->def.cipso == NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_CALIPSO: + if (entry->family != AF_INET6 || + entry->def.calipso == NULL) return -EINVAL; break; case NETLBL_NLTYPE_ADDRSELECT: @@ -294,6 +337,12 @@ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry) map6 = netlbl_domhsh_addr6_entry(iter6); switch (map6->def.type) { case NETLBL_NLTYPE_UNLABELED: + if (map6->def.calipso != NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_CALIPSO: + if (map6->def.calipso == NULL) + return -EINVAL; break; default: return -EINVAL; @@ -358,15 +407,18 @@ int __init netlbl_domhsh_init(u32 size) * * Description: * Adds a new entry to the domain hash table and handles any updates to the - * lower level protocol handler (i.e. CIPSO). Returns zero on success, - * negative on failure. + * lower level protocol handler (i.e. CIPSO). @entry->family may be set to + * %AF_UNSPEC which will add an entry that matches all address families. This + * is only useful for the unlabelled type and will only succeed if there is no + * existing entry for any address family with the same domain. Returns zero + * on success, negative on failure. * */ int netlbl_domhsh_add(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info) { int ret_val = 0; - struct netlbl_dom_map *entry_old; + struct netlbl_dom_map *entry_old, *entry_b; struct netlbl_af4list *iter4; struct netlbl_af4list *tmp4; #if IS_ENABLED(CONFIG_IPV6) @@ -385,9 +437,10 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, rcu_read_lock(); spin_lock(&netlbl_domhsh_lock); if (entry->domain != NULL) - entry_old = netlbl_domhsh_search(entry->domain); + entry_old = netlbl_domhsh_search(entry->domain, entry->family); else - entry_old = netlbl_domhsh_search_def(entry->domain); + entry_old = netlbl_domhsh_search_def(entry->domain, + entry->family); if (entry_old == NULL) { entry->valid = 1; @@ -397,7 +450,41 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, &rcu_dereference(netlbl_domhsh)->tbl[bkt]); } else { INIT_LIST_HEAD(&entry->list); - rcu_assign_pointer(netlbl_domhsh_def, entry); + switch (entry->family) { + case AF_INET: + rcu_assign_pointer(netlbl_domhsh_def_ipv4, + entry); + break; + case AF_INET6: + rcu_assign_pointer(netlbl_domhsh_def_ipv6, + entry); + break; + case AF_UNSPEC: + if (entry->def.type != + NETLBL_NLTYPE_UNLABELED) { + ret_val = -EINVAL; + goto add_return; + } + entry_b = kzalloc(sizeof(*entry_b), GFP_ATOMIC); + if (entry_b == NULL) { + ret_val = -ENOMEM; + goto add_return; + } + entry_b->family = AF_INET6; + entry_b->def.type = NETLBL_NLTYPE_UNLABELED; + entry_b->valid = 1; + entry->family = AF_INET; + rcu_assign_pointer(netlbl_domhsh_def_ipv4, + entry); + rcu_assign_pointer(netlbl_domhsh_def_ipv6, + entry_b); + break; + default: + /* Already checked in + * netlbl_domhsh_validate(). */ + ret_val = -EINVAL; + goto add_return; + } } if (entry->def.type == NETLBL_NLTYPE_ADDRSELECT) { @@ -513,10 +600,12 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, spin_lock(&netlbl_domhsh_lock); if (entry->valid) { entry->valid = 0; - if (entry != rcu_dereference(netlbl_domhsh_def)) - list_del_rcu(&entry->list); + if (entry == rcu_dereference(netlbl_domhsh_def_ipv4)) + RCU_INIT_POINTER(netlbl_domhsh_def_ipv4, NULL); + else if (entry == rcu_dereference(netlbl_domhsh_def_ipv6)) + RCU_INIT_POINTER(netlbl_domhsh_def_ipv6, NULL); else - RCU_INIT_POINTER(netlbl_domhsh_def, NULL); + list_del_rcu(&entry->list); } else ret_val = -ENOENT; spin_unlock(&netlbl_domhsh_lock); @@ -533,6 +622,10 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, if (ret_val == 0) { struct netlbl_af4list *iter4; struct netlbl_domaddr4_map *map4; +#if IS_ENABLED(CONFIG_IPV6) + struct netlbl_af6list *iter6; + struct netlbl_domaddr6_map *map6; +#endif /* IPv6 */ switch (entry->def.type) { case NETLBL_NLTYPE_ADDRSELECT: @@ -541,12 +634,22 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, map4 = netlbl_domhsh_addr4_entry(iter4); cipso_v4_doi_putdef(map4->def.cipso); } - /* no need to check the IPv6 list since we currently - * support only unlabeled protocols for IPv6 */ +#if IS_ENABLED(CONFIG_IPV6) + netlbl_af6list_foreach_rcu(iter6, + &entry->def.addrsel->list6) { + map6 = netlbl_domhsh_addr6_entry(iter6); + calipso_doi_putdef(map6->def.calipso); + } +#endif /* IPv6 */ break; case NETLBL_NLTYPE_CIPSOV4: cipso_v4_doi_putdef(entry->def.cipso); break; +#if IS_ENABLED(CONFIG_IPV6) + case NETLBL_NLTYPE_CALIPSO: + calipso_doi_putdef(entry->def.calipso); + break; +#endif /* IPv6 */ } call_rcu(&entry->rcu, netlbl_domhsh_free_entry); } @@ -583,9 +686,9 @@ int netlbl_domhsh_remove_af4(const char *domain, rcu_read_lock(); if (domain) - entry_map = netlbl_domhsh_search(domain); + entry_map = netlbl_domhsh_search(domain, AF_INET); else - entry_map = netlbl_domhsh_search_def(domain); + entry_map = netlbl_domhsh_search_def(domain, AF_INET); if (entry_map == NULL || entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT) goto remove_af4_failure; @@ -622,28 +725,114 @@ remove_af4_failure: return -ENOENT; } +#if IS_ENABLED(CONFIG_IPV6) +/** + * netlbl_domhsh_remove_af6 - Removes an address selector entry + * @domain: the domain + * @addr: IPv6 address + * @mask: IPv6 address mask + * @audit_info: NetLabel audit information + * + * Description: + * Removes an individual address selector from a domain mapping and potentially + * the entire mapping if it is empty. Returns zero on success, negative values + * on failure. + * + */ +int netlbl_domhsh_remove_af6(const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info) +{ + struct netlbl_dom_map *entry_map; + struct netlbl_af6list *entry_addr; + struct netlbl_af4list *iter4; + struct netlbl_af6list *iter6; + struct netlbl_domaddr6_map *entry; + + rcu_read_lock(); + + if (domain) + entry_map = netlbl_domhsh_search(domain, AF_INET6); + else + entry_map = netlbl_domhsh_search_def(domain, AF_INET6); + if (entry_map == NULL || + entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT) + goto remove_af6_failure; + + spin_lock(&netlbl_domhsh_lock); + entry_addr = netlbl_af6list_remove(addr, mask, + &entry_map->def.addrsel->list6); + spin_unlock(&netlbl_domhsh_lock); + + if (entry_addr == NULL) + goto remove_af6_failure; + netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4) + goto remove_af6_single_addr; + netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6) + goto remove_af6_single_addr; + /* the domain mapping is empty so remove it from the mapping table */ + netlbl_domhsh_remove_entry(entry_map, audit_info); + +remove_af6_single_addr: + rcu_read_unlock(); + /* yick, we can't use call_rcu here because we don't have a rcu head + * pointer but hopefully this should be a rare case so the pause + * shouldn't be a problem */ + synchronize_rcu(); + entry = netlbl_domhsh_addr6_entry(entry_addr); + calipso_doi_putdef(entry->def.calipso); + kfree(entry); + return 0; + +remove_af6_failure: + rcu_read_unlock(); + return -ENOENT; +} +#endif /* IPv6 */ + /** * netlbl_domhsh_remove - Removes an entry from the domain hash table * @domain: the domain to remove + * @family: address family * @audit_info: NetLabel audit information * * Description: * Removes an entry from the domain hash table and handles any updates to the - * lower level protocol handler (i.e. CIPSO). Returns zero on success, - * negative on failure. + * lower level protocol handler (i.e. CIPSO). @family may be %AF_UNSPEC which + * removes all address family entries. Returns zero on success, negative on + * failure. * */ -int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) +int netlbl_domhsh_remove(const char *domain, u16 family, + struct netlbl_audit *audit_info) { - int ret_val; + int ret_val = -EINVAL; struct netlbl_dom_map *entry; rcu_read_lock(); - if (domain) - entry = netlbl_domhsh_search(domain); - else - entry = netlbl_domhsh_search_def(domain); - ret_val = netlbl_domhsh_remove_entry(entry, audit_info); + + if (family == AF_INET || family == AF_UNSPEC) { + if (domain) + entry = netlbl_domhsh_search(domain, AF_INET); + else + entry = netlbl_domhsh_search_def(domain, AF_INET); + ret_val = netlbl_domhsh_remove_entry(entry, audit_info); + if (ret_val && ret_val != -ENOENT) + goto done; + } + if (family == AF_INET6 || family == AF_UNSPEC) { + int ret_val2; + + if (domain) + entry = netlbl_domhsh_search(domain, AF_INET6); + else + entry = netlbl_domhsh_search_def(domain, AF_INET6); + ret_val2 = netlbl_domhsh_remove_entry(entry, audit_info); + if (ret_val2 != -ENOENT) + ret_val = ret_val2; + } +done: rcu_read_unlock(); return ret_val; @@ -651,32 +840,38 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) /** * netlbl_domhsh_remove_default - Removes the default entry from the table + * @family: address family * @audit_info: NetLabel audit information * * Description: - * Removes/resets the default entry for the domain hash table and handles any - * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on - * success, non-zero on failure. + * Removes/resets the default entry corresponding to @family from the domain + * hash table and handles any updates to the lower level protocol handler + * (i.e. CIPSO). @family may be %AF_UNSPEC which removes all address family + * entries. Returns zero on success, negative on failure. * */ -int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) +int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info) { - return netlbl_domhsh_remove(NULL, audit_info); + return netlbl_domhsh_remove(NULL, family, audit_info); } /** * netlbl_domhsh_getentry - Get an entry from the domain hash table * @domain: the domain name to search for + * @family: address family * * Description: * Look through the domain hash table searching for an entry to match @domain, - * return a pointer to a copy of the entry or NULL. The caller is responsible - * for ensuring that rcu_read_[un]lock() is called. + * with address family @family, return a pointer to a copy of the entry or + * NULL. The caller is responsible for ensuring that rcu_read_[un]lock() is + * called. * */ -struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) +struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family) { - return netlbl_domhsh_search_def(domain); + if (family == AF_UNSPEC) + return NULL; + return netlbl_domhsh_search_def(domain, family); } /** @@ -696,7 +891,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain, struct netlbl_dom_map *dom_iter; struct netlbl_af4list *addr_iter; - dom_iter = netlbl_domhsh_search_def(domain); + dom_iter = netlbl_domhsh_search_def(domain, AF_INET); if (dom_iter == NULL) return NULL; @@ -726,7 +921,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain, struct netlbl_dom_map *dom_iter; struct netlbl_af6list *addr_iter; - dom_iter = netlbl_domhsh_search_def(domain); + dom_iter = netlbl_domhsh_search_def(domain, AF_INET6); if (dom_iter == NULL) return NULL; diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h index 680caf4df..1f9247781 100644 --- a/net/netlabel/netlabel_domainhash.h +++ b/net/netlabel/netlabel_domainhash.h @@ -51,6 +51,7 @@ struct netlbl_dommap_def { union { struct netlbl_domaddr_map *addrsel; struct cipso_v4_doi *cipso; + struct calipso_doi *calipso; }; }; #define netlbl_domhsh_addr4_entry(iter) \ @@ -70,6 +71,7 @@ struct netlbl_domaddr6_map { struct netlbl_dom_map { char *domain; + u16 family; struct netlbl_dommap_def def; u32 valid; @@ -91,14 +93,23 @@ int netlbl_domhsh_remove_af4(const char *domain, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info); -int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); -int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info); -struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain); +int netlbl_domhsh_remove_af6(const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info); +int netlbl_domhsh_remove(const char *domain, u16 family, + struct netlbl_audit *audit_info); +int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info); +struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family); struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain, __be32 addr); #if IS_ENABLED(CONFIG_IPV6) struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain, const struct in6_addr *addr); +int netlbl_domhsh_remove_af6(const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info); #endif /* IPv6 */ int netlbl_domhsh_walk(u32 *skip_bkt, diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index bd007a9fd..28c56b95f 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -37,12 +37,14 @@ #include #include #include +#include #include #include #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" #include "netlabel_cipso_v4.h" +#include "netlabel_calipso.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" #include "netlabel_addrlist.h" @@ -72,12 +74,17 @@ int netlbl_cfg_map_del(const char *domain, struct netlbl_audit *audit_info) { if (addr == NULL && mask == NULL) { - return netlbl_domhsh_remove(domain, audit_info); + return netlbl_domhsh_remove(domain, family, audit_info); } else if (addr != NULL && mask != NULL) { switch (family) { case AF_INET: return netlbl_domhsh_remove_af4(domain, addr, mask, audit_info); +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return netlbl_domhsh_remove_af6(domain, addr, mask, + audit_info); +#endif /* IPv6 */ default: return -EPFNOSUPPORT; } @@ -119,6 +126,7 @@ int netlbl_cfg_unlbl_map_add(const char *domain, if (entry->domain == NULL) goto cfg_unlbl_map_add_failure; } + entry->family = family; if (addr == NULL && mask == NULL) entry->def.type = NETLBL_NLTYPE_UNLABELED; @@ -345,6 +353,7 @@ int netlbl_cfg_cipsov4_map_add(u32 doi, entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) goto out_entry; + entry->family = AF_INET; if (domain != NULL) { entry->domain = kstrdup(domain, GFP_ATOMIC); if (entry->domain == NULL) @@ -399,6 +408,139 @@ out_entry: return ret_val; } +/** + * netlbl_cfg_calipso_add - Add a new CALIPSO DOI definition + * @doi_def: CALIPSO DOI definition + * @audit_info: NetLabel audit information + * + * Description: + * Add a new CALIPSO DOI definition as defined by @doi_def. Returns zero on + * success and negative values on failure. + * + */ +int netlbl_cfg_calipso_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ +#if IS_ENABLED(CONFIG_IPV6) + return calipso_doi_add(doi_def, audit_info); +#else /* IPv6 */ + return -ENOSYS; +#endif /* IPv6 */ +} + +/** + * netlbl_cfg_calipso_del - Remove an existing CALIPSO DOI definition + * @doi: CALIPSO DOI + * @audit_info: NetLabel audit information + * + * Description: + * Remove an existing CALIPSO DOI definition matching @doi. Returns zero on + * success and negative values on failure. + * + */ +void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info) +{ +#if IS_ENABLED(CONFIG_IPV6) + calipso_doi_remove(doi, audit_info); +#endif /* IPv6 */ +} + +/** + * netlbl_cfg_calipso_map_add - Add a new CALIPSO DOI mapping + * @doi: the CALIPSO DOI + * @domain: the domain mapping to add + * @addr: IP address + * @mask: IP address mask + * @audit_info: NetLabel audit information + * + * Description: + * Add a new NetLabel/LSM domain mapping for the given CALIPSO DOI to the + * NetLabel subsystem. A @domain value of NULL adds a new default domain + * mapping. Returns zero on success, negative values on failure. + * + */ +int netlbl_cfg_calipso_map_add(u32 doi, + const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info) +{ +#if IS_ENABLED(CONFIG_IPV6) + int ret_val = -ENOMEM; + struct calipso_doi *doi_def; + struct netlbl_dom_map *entry; + struct netlbl_domaddr_map *addrmap = NULL; + struct netlbl_domaddr6_map *addrinfo = NULL; + + doi_def = calipso_doi_getdef(doi); + if (doi_def == NULL) + return -ENOENT; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + goto out_entry; + entry->family = AF_INET6; + if (domain != NULL) { + entry->domain = kstrdup(domain, GFP_ATOMIC); + if (entry->domain == NULL) + goto out_domain; + } + + if (addr == NULL && mask == NULL) { + entry->def.calipso = doi_def; + entry->def.type = NETLBL_NLTYPE_CALIPSO; + } else if (addr != NULL && mask != NULL) { + addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); + if (addrmap == NULL) + goto out_addrmap; + INIT_LIST_HEAD(&addrmap->list4); + INIT_LIST_HEAD(&addrmap->list6); + + addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC); + if (addrinfo == NULL) + goto out_addrinfo; + addrinfo->def.calipso = doi_def; + addrinfo->def.type = NETLBL_NLTYPE_CALIPSO; + addrinfo->list.addr = *addr; + addrinfo->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; + addrinfo->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; + addrinfo->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; + addrinfo->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; + addrinfo->list.mask = *mask; + addrinfo->list.valid = 1; + ret_val = netlbl_af6list_add(&addrinfo->list, &addrmap->list6); + if (ret_val != 0) + goto cfg_calipso_map_add_failure; + + entry->def.addrsel = addrmap; + entry->def.type = NETLBL_NLTYPE_ADDRSELECT; + } else { + ret_val = -EINVAL; + goto out_addrmap; + } + + ret_val = netlbl_domhsh_add(entry, audit_info); + if (ret_val != 0) + goto cfg_calipso_map_add_failure; + + return 0; + +cfg_calipso_map_add_failure: + kfree(addrinfo); +out_addrinfo: + kfree(addrmap); +out_addrmap: + kfree(entry->domain); +out_domain: + kfree(entry); +out_entry: + calipso_doi_putdef(doi_def); + return ret_val; +#else /* IPv6 */ + return -ENOSYS; +#endif /* IPv6 */ +} + /* * Security Attribute Functions */ @@ -519,6 +661,7 @@ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset) return -ENOENT; } +EXPORT_SYMBOL(netlbl_catmap_walk); /** * netlbl_catmap_walkrng - Find the end of a string of set bits @@ -609,20 +752,19 @@ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, off = catmap->startbit; *offset = off; } - iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_NONE, 0); + iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_WALK, 0); if (iter == NULL) { *offset = (u32)-1; return 0; } if (off < iter->startbit) { - off = iter->startbit; - *offset = off; + *offset = iter->startbit; + off = 0; } else off -= iter->startbit; - idx = off / NETLBL_CATMAP_MAPSIZE; - *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_SIZE); + *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_MAPSIZE); return 0; } @@ -655,6 +797,7 @@ int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap, return 0; } +EXPORT_SYMBOL(netlbl_catmap_setbit); /** * netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap @@ -727,6 +870,76 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, return 0; } +/* Bitmap functions + */ + +/** + * netlbl_bitmap_walk - Walk a bitmap looking for a bit + * @bitmap: the bitmap + * @bitmap_len: length in bits + * @offset: starting offset + * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit + * + * Description: + * Starting at @offset, walk the bitmap from left to right until either the + * desired bit is found or we reach the end. Return the bit offset, -1 if + * not found, or -2 if error. + */ +int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, + u32 offset, u8 state) +{ + u32 bit_spot; + u32 byte_offset; + unsigned char bitmask; + unsigned char byte; + + byte_offset = offset / 8; + byte = bitmap[byte_offset]; + bit_spot = offset; + bitmask = 0x80 >> (offset % 8); + + while (bit_spot < bitmap_len) { + if ((state && (byte & bitmask) == bitmask) || + (state == 0 && (byte & bitmask) == 0)) + return bit_spot; + + bit_spot++; + bitmask >>= 1; + if (bitmask == 0) { + byte = bitmap[++byte_offset]; + bitmask = 0x80; + } + } + + return -1; +} +EXPORT_SYMBOL(netlbl_bitmap_walk); + +/** + * netlbl_bitmap_setbit - Sets a single bit in a bitmap + * @bitmap: the bitmap + * @bit: the bit + * @state: if non-zero, set the bit (1) else clear the bit (0) + * + * Description: + * Set a single bit in the bitmask. Returns zero on success, negative values + * on error. + */ +void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state) +{ + u32 byte_spot; + u8 bitmask; + + /* gcc always rounds to zero when doing integer division */ + byte_spot = bit / 8; + bitmask = 0x80 >> (bit % 8); + if (state) + bitmap[byte_spot] |= bitmask; + else + bitmap[byte_spot] &= ~bitmask; +} +EXPORT_SYMBOL(netlbl_bitmap_setbit); + /* * LSM Functions */ @@ -774,7 +987,7 @@ int netlbl_sock_setattr(struct sock *sk, struct netlbl_dom_map *dom_entry; rcu_read_lock(); - dom_entry = netlbl_domhsh_getentry(secattr->domain); + dom_entry = netlbl_domhsh_getentry(secattr->domain, family); if (dom_entry == NULL) { ret_val = -ENOENT; goto socket_setattr_return; @@ -799,9 +1012,21 @@ int netlbl_sock_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + switch (dom_entry->def.type) { + case NETLBL_NLTYPE_ADDRSELECT: + ret_val = -EDESTADDRREQ; + break; + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_sock_setattr(sk, + dom_entry->def.calipso, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -828,6 +1053,11 @@ void netlbl_sock_delattr(struct sock *sk) case AF_INET: cipso_v4_sock_delattr(sk); break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + calipso_sock_delattr(sk); + break; +#endif /* IPv6 */ } } @@ -854,7 +1084,7 @@ int netlbl_sock_getattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - ret_val = -ENOMSG; + ret_val = calipso_sock_getattr(sk, secattr); break; #endif /* IPv6 */ default: @@ -882,6 +1112,9 @@ int netlbl_conn_setattr(struct sock *sk, { int ret_val; struct sockaddr_in *addr4; +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 *addr6; +#endif struct netlbl_dommap_def *entry; rcu_read_lock(); @@ -902,7 +1135,7 @@ int netlbl_conn_setattr(struct sock *sk, case NETLBL_NLTYPE_UNLABELED: /* just delete the protocols we support for right now * but we could remove other protocols if needed */ - cipso_v4_sock_delattr(sk); + netlbl_sock_delattr(sk); ret_val = 0; break; default: @@ -911,9 +1144,27 @@ int netlbl_conn_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + addr6 = (struct sockaddr_in6 *)addr; + entry = netlbl_domhsh_getentry_af6(secattr->domain, + &addr6->sin6_addr); + if (entry == NULL) { + ret_val = -ENOENT; + goto conn_setattr_return; + } + switch (entry->type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_sock_setattr(sk, + entry->calipso, secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + netlbl_sock_delattr(sk); + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -940,12 +1191,13 @@ int netlbl_req_setattr(struct request_sock *req, { int ret_val; struct netlbl_dommap_def *entry; + struct inet_request_sock *ireq = inet_rsk(req); rcu_read_lock(); switch (req->rsk_ops->family) { case AF_INET: entry = netlbl_domhsh_getentry_af4(secattr->domain, - inet_rsk(req)->ir_rmt_addr); + ireq->ir_rmt_addr); if (entry == NULL) { ret_val = -ENOENT; goto req_setattr_return; @@ -956,9 +1208,7 @@ int netlbl_req_setattr(struct request_sock *req, entry->cipso, secattr); break; case NETLBL_NLTYPE_UNLABELED: - /* just delete the protocols we support for right now - * but we could remove other protocols if needed */ - cipso_v4_req_delattr(req); + netlbl_req_delattr(req); ret_val = 0; break; default: @@ -967,9 +1217,24 @@ int netlbl_req_setattr(struct request_sock *req, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + entry = netlbl_domhsh_getentry_af6(secattr->domain, + &ireq->ir_v6_rmt_addr); + if (entry == NULL) { + ret_val = -ENOENT; + goto req_setattr_return; + } + switch (entry->type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_req_setattr(req, + entry->calipso, secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + netlbl_req_delattr(req); + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -995,6 +1260,11 @@ void netlbl_req_delattr(struct request_sock *req) case AF_INET: cipso_v4_req_delattr(req); break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + calipso_req_delattr(req); + break; +#endif /* IPv6 */ } } @@ -1015,13 +1285,17 @@ int netlbl_skbuff_setattr(struct sk_buff *skb, { int ret_val; struct iphdr *hdr4; +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *hdr6; +#endif struct netlbl_dommap_def *entry; rcu_read_lock(); switch (family) { case AF_INET: hdr4 = ip_hdr(skb); - entry = netlbl_domhsh_getentry_af4(secattr->domain,hdr4->daddr); + entry = netlbl_domhsh_getentry_af4(secattr->domain, + hdr4->daddr); if (entry == NULL) { ret_val = -ENOENT; goto skbuff_setattr_return; @@ -1042,9 +1316,26 @@ int netlbl_skbuff_setattr(struct sk_buff *skb, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + hdr6 = ipv6_hdr(skb); + entry = netlbl_domhsh_getentry_af6(secattr->domain, + &hdr6->daddr); + if (entry == NULL) { + ret_val = -ENOENT; + goto skbuff_setattr_return; + } + switch (entry->type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_skbuff_setattr(skb, entry->calipso, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + ret_val = calipso_skbuff_delattr(skb); + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -1083,6 +1374,9 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + ptr = calipso_optptr(skb); + if (ptr && calipso_getattr(ptr, secattr) == 0) + return 0; break; #endif /* IPv6 */ } @@ -1093,6 +1387,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, /** * netlbl_skbuff_err - Handle a LSM error on a sk_buff * @skb: the packet + * @family: the family * @error: the error code * @gateway: true if host is acting as a gateway, false otherwise * @@ -1102,10 +1397,14 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, * according to the packet's labeling protocol. * */ -void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway) +void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway) { - if (cipso_v4_optptr(skb)) - cipso_v4_error(skb, error, gateway); + switch (family) { + case AF_INET: + if (cipso_v4_optptr(skb)) + cipso_v4_error(skb, error, gateway); + break; + } } /** @@ -1120,11 +1419,15 @@ void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway) void netlbl_cache_invalidate(void) { cipso_v4_cache_invalidate(); +#if IS_ENABLED(CONFIG_IPV6) + calipso_cache_invalidate(); +#endif /* IPv6 */ } /** * netlbl_cache_add - Add an entry to a NetLabel protocol cache * @skb: the packet + * @family: the family * @secattr: the packet's security attributes * * Description: @@ -1133,7 +1436,7 @@ void netlbl_cache_invalidate(void) * values on error. * */ -int netlbl_cache_add(const struct sk_buff *skb, +int netlbl_cache_add(const struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr) { unsigned char *ptr; @@ -1141,10 +1444,20 @@ int netlbl_cache_add(const struct sk_buff *skb, if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0) return -ENOMSG; - ptr = cipso_v4_optptr(skb); - if (ptr) - return cipso_v4_cache_add(ptr, secattr); - + switch (family) { + case AF_INET: + ptr = cipso_v4_optptr(skb); + if (ptr) + return cipso_v4_cache_add(ptr, secattr); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ptr = calipso_optptr(skb); + if (ptr) + return calipso_cache_add(ptr, secattr); + break; +#endif /* IPv6 */ + } return -ENOMSG; } @@ -1169,6 +1482,7 @@ struct audit_buffer *netlbl_audit_start(int type, { return netlbl_audit_start_common(type, audit_info); } +EXPORT_SYMBOL(netlbl_audit_start); /* * Setup Functions diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 13f777f20..f85d0e07a 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -41,8 +41,10 @@ #include #include #include +#include #include +#include "netlabel_calipso.h" #include "netlabel_domainhash.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" @@ -72,6 +74,8 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { [NLBL_MGMT_A_PROTOCOL] = { .type = NLA_U32 }, [NLBL_MGMT_A_VERSION] = { .type = NLA_U32 }, [NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 }, + [NLBL_MGMT_A_FAMILY] = { .type = NLA_U16 }, + [NLBL_MGMT_A_CLPDOI] = { .type = NLA_U32 }, }; /* @@ -95,6 +99,9 @@ static int netlbl_mgmt_add_common(struct genl_info *info, int ret_val = -EINVAL; struct netlbl_domaddr_map *addrmap = NULL; struct cipso_v4_doi *cipsov4 = NULL; +#if IS_ENABLED(CONFIG_IPV6) + struct calipso_doi *calipso = NULL; +#endif u32 tmp_val; struct netlbl_dom_map *entry = kzalloc(sizeof(*entry), GFP_KERNEL); @@ -119,6 +126,11 @@ static int netlbl_mgmt_add_common(struct genl_info *info, switch (entry->def.type) { case NETLBL_NLTYPE_UNLABELED: + if (info->attrs[NLBL_MGMT_A_FAMILY]) + entry->family = + nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]); + else + entry->family = AF_UNSPEC; break; case NETLBL_NLTYPE_CIPSOV4: if (!info->attrs[NLBL_MGMT_A_CV4DOI]) @@ -128,12 +140,30 @@ static int netlbl_mgmt_add_common(struct genl_info *info, cipsov4 = cipso_v4_doi_getdef(tmp_val); if (cipsov4 == NULL) goto add_free_domain; + entry->family = AF_INET; entry->def.cipso = cipsov4; break; +#if IS_ENABLED(CONFIG_IPV6) + case NETLBL_NLTYPE_CALIPSO: + if (!info->attrs[NLBL_MGMT_A_CLPDOI]) + goto add_free_domain; + + tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CLPDOI]); + calipso = calipso_doi_getdef(tmp_val); + if (calipso == NULL) + goto add_free_domain; + entry->family = AF_INET6; + entry->def.calipso = calipso; + break; +#endif /* IPv6 */ default: goto add_free_domain; } + if ((entry->family == AF_INET && info->attrs[NLBL_MGMT_A_IPV6ADDR]) || + (entry->family == AF_INET6 && info->attrs[NLBL_MGMT_A_IPV4ADDR])) + goto add_doi_put_def; + if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) { struct in_addr *addr; struct in_addr *mask; @@ -178,6 +208,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, goto add_free_addrmap; } + entry->family = AF_INET; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; entry->def.addrsel = addrmap; #if IS_ENABLED(CONFIG_IPV6) @@ -220,6 +251,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->list.mask = *mask; map->list.valid = 1; map->def.type = entry->def.type; + if (calipso) + map->def.calipso = calipso; ret_val = netlbl_af6list_add(&map->list, &addrmap->list6); if (ret_val != 0) { @@ -227,6 +260,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, goto add_free_addrmap; } + entry->family = AF_INET6; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; entry->def.addrsel = addrmap; #endif /* IPv6 */ @@ -242,6 +276,9 @@ add_free_addrmap: kfree(addrmap); add_doi_put_def: cipso_v4_doi_putdef(cipsov4); +#if IS_ENABLED(CONFIG_IPV6) + calipso_doi_putdef(calipso); +#endif add_free_domain: kfree(entry->domain); add_free_entry: @@ -278,6 +315,10 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, return ret_val; } + ret_val = nla_put_u16(skb, NLBL_MGMT_A_FAMILY, entry->family); + if (ret_val != 0) + return ret_val; + switch (entry->def.type) { case NETLBL_NLTYPE_ADDRSELECT: nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST); @@ -340,6 +381,15 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, if (ret_val != 0) return ret_val; + switch (map6->def.type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI, + map6->def.calipso->doi); + if (ret_val != 0) + return ret_val; + break; + } + nla_nest_end(skb, nla_b); } #endif /* IPv6 */ @@ -347,15 +397,25 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, nla_nest_end(skb, nla_a); break; case NETLBL_NLTYPE_UNLABELED: - ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type); + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + entry->def.type); break; case NETLBL_NLTYPE_CIPSOV4: - ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type); + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + entry->def.type); if (ret_val != 0) return ret_val; ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI, entry->def.cipso->doi); break; + case NETLBL_NLTYPE_CALIPSO: + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + entry->def.type); + if (ret_val != 0) + return ret_val; + ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI, + entry->def.calipso->doi); + break; } return ret_val; @@ -418,7 +478,7 @@ static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info) netlbl_netlink_auditinfo(skb, &audit_info); domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]); - return netlbl_domhsh_remove(domain, &audit_info); + return netlbl_domhsh_remove(domain, AF_UNSPEC, &audit_info); } /** @@ -536,7 +596,7 @@ static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info) netlbl_netlink_auditinfo(skb, &audit_info); - return netlbl_domhsh_remove_default(&audit_info); + return netlbl_domhsh_remove_default(AF_UNSPEC, &audit_info); } /** @@ -556,6 +616,12 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info) struct sk_buff *ans_skb = NULL; void *data; struct netlbl_dom_map *entry; + u16 family; + + if (info->attrs[NLBL_MGMT_A_FAMILY]) + family = nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]); + else + family = AF_INET; ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (ans_skb == NULL) @@ -566,7 +632,7 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info) goto listdef_failure; rcu_read_lock(); - entry = netlbl_domhsh_getentry(NULL); + entry = netlbl_domhsh_getentry(NULL, family); if (entry == NULL) { ret_val = -ENOENT; goto listdef_failure_lock; @@ -651,6 +717,15 @@ static int netlbl_mgmt_protocols(struct sk_buff *skb, goto protocols_return; protos_sent++; } +#if IS_ENABLED(CONFIG_IPV6) + if (protos_sent == 2) { + if (netlbl_mgmt_protocols_cb(skb, + cb, + NETLBL_NLTYPE_CALIPSO) < 0) + goto protocols_return; + protos_sent++; + } +#endif protocols_return: cb->args[0] = protos_sent; diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h index 8b6e1ab62..ea01e42bc 100644 --- a/net/netlabel/netlabel_mgmt.h +++ b/net/netlabel/netlabel_mgmt.h @@ -58,7 +58,10 @@ * * NLBL_MGMT_A_CV4DOI * - * If using NETLBL_NLTYPE_UNLABELED no other attributes are required. + * If using NETLBL_NLTYPE_UNLABELED no other attributes are required, + * however the following attribute may optionally be sent: + * + * NLBL_MGMT_A_FAMILY * * o REMOVE: * Sent by an application to remove a domain mapping from the NetLabel @@ -77,6 +80,7 @@ * Required attributes: * * NLBL_MGMT_A_DOMAIN + * NLBL_MGMT_A_FAMILY * * If the IP address selectors are not used the following attribute is * required: @@ -108,7 +112,10 @@ * * NLBL_MGMT_A_CV4DOI * - * If using NETLBL_NLTYPE_UNLABELED no other attributes are required. + * If using NETLBL_NLTYPE_UNLABELED no other attributes are required, + * however the following attribute may optionally be sent: + * + * NLBL_MGMT_A_FAMILY * * o REMOVEDEF: * Sent by an application to remove the default domain mapping from the @@ -117,13 +124,17 @@ * o LISTDEF: * This message can be sent either from an application or by the kernel in * response to an application generated LISTDEF message. When sent by an - * application there is no payload. On success the kernel should send a - * response using the following format. + * application there may be an optional payload. * - * If the IP address selectors are not used the following attribute is + * NLBL_MGMT_A_FAMILY + * + * On success the kernel should send a response using the following format: + * + * If the IP address selectors are not used the following attributes are * required: * * NLBL_MGMT_A_PROTOCOL + * NLBL_MGMT_A_FAMILY * * If the IP address selectors are used then the following attritbute is * required: @@ -209,6 +220,12 @@ enum { /* (NLA_NESTED) * the selector list, there must be at least one * NLBL_MGMT_A_ADDRSELECTOR attribute */ + NLBL_MGMT_A_FAMILY, + /* (NLA_U16) + * The address family */ + NLBL_MGMT_A_CLPDOI, + /* (NLA_U32) + * the CALIPSO DOI value */ __NLBL_MGMT_A_MAX, }; #define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 9eaa9a1e8..4528cff91 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -116,8 +116,8 @@ struct netlbl_unlhsh_walk_arg { static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) -static struct netlbl_unlhsh_tbl *netlbl_unlhsh; -static struct netlbl_unlhsh_iface *netlbl_unlhsh_def; +static struct netlbl_unlhsh_tbl __rcu *netlbl_unlhsh; +static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def; /* Accept unlabeled packets flag */ static u8 netlabel_unlabel_acceptflg; @@ -1537,6 +1537,7 @@ int __init netlbl_unlabel_defconf(void) entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) return -ENOMEM; + entry->family = AF_UNSPEC; entry->def.type = NETLBL_NLTYPE_UNLABELED; ret_val = netlbl_domhsh_add_default(entry, &audit_info); if (ret_val != 0) diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index adf8b7900..58495f44c 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -44,6 +44,7 @@ #include "netlabel_mgmt.h" #include "netlabel_unlabeled.h" #include "netlabel_cipso_v4.h" +#include "netlabel_calipso.h" #include "netlabel_user.h" /* @@ -71,6 +72,10 @@ int __init netlbl_netlink_init(void) if (ret_val != 0) return ret_val; + ret_val = netlbl_calipso_genl_init(); + if (ret_val != 0) + return ret_val; + return netlbl_unlabel_genl_init(); } diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index e68ef9ccd..3cfd6cc60 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -8,20 +8,6 @@ #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) -struct netlink_ring { - void **pg_vec; - unsigned int head; - unsigned int frames_per_block; - unsigned int frame_size; - unsigned int frame_max; - - unsigned int pg_vec_order; - unsigned int pg_vec_pages; - unsigned int pg_vec_len; - - atomic_t pending; -}; - struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index dd9003f38..0fd5518bf 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -30,6 +30,9 @@ #define DIGITAL_PROTO_ISO15693_RF_TECH NFC_PROTO_ISO15693_MASK +/* Delay between each poll frame (ms) */ +#define DIGITAL_POLL_INTERVAL 10 + struct digital_cmd { struct list_head queue; @@ -173,6 +176,8 @@ static void digital_wq_cmd(struct work_struct *work) return; } + cmd->pending = 1; + mutex_unlock(&ddev->cmd_lock); if (cmd->req) @@ -419,7 +424,8 @@ void digital_poll_next_tech(struct nfc_digital_dev *ddev) mutex_unlock(&ddev->poll_lock); - schedule_work(&ddev->poll_work); + schedule_delayed_work(&ddev->poll_work, + msecs_to_jiffies(DIGITAL_POLL_INTERVAL)); } static void digital_wq_poll(struct work_struct *work) @@ -428,7 +434,7 @@ static void digital_wq_poll(struct work_struct *work) struct digital_poll_tech *poll_tech; struct nfc_digital_dev *ddev = container_of(work, struct nfc_digital_dev, - poll_work); + poll_work.work); mutex_lock(&ddev->poll_lock); if (!ddev->poll_tech_count) { @@ -543,7 +549,7 @@ static int digital_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols, return -EINVAL; } - schedule_work(&ddev->poll_work); + schedule_delayed_work(&ddev->poll_work, 0); return 0; } @@ -564,7 +570,7 @@ static void digital_stop_poll(struct nfc_dev *nfc_dev) mutex_unlock(&ddev->poll_lock); - cancel_work_sync(&ddev->poll_work); + cancel_delayed_work_sync(&ddev->poll_work); digital_abort_cmd(ddev); } @@ -606,6 +612,8 @@ static int digital_dep_link_down(struct nfc_dev *nfc_dev) { struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev); + digital_abort_cmd(ddev); + ddev->curr_protocol = 0; return 0; @@ -770,7 +778,7 @@ struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops, INIT_WORK(&ddev->cmd_complete_work, digital_wq_cmd_complete); mutex_init(&ddev->poll_lock); - INIT_WORK(&ddev->poll_work, digital_wq_poll); + INIT_DELAYED_WORK(&ddev->poll_work, digital_wq_poll); if (supported_protocols & NFC_PROTO_JEWEL_MASK) ddev->protocols |= NFC_PROTO_JEWEL_MASK; @@ -832,12 +840,20 @@ void nfc_digital_unregister_device(struct nfc_digital_dev *ddev) ddev->poll_tech_count = 0; mutex_unlock(&ddev->poll_lock); - cancel_work_sync(&ddev->poll_work); + cancel_delayed_work_sync(&ddev->poll_work); cancel_work_sync(&ddev->cmd_work); cancel_work_sync(&ddev->cmd_complete_work); list_for_each_entry_safe(cmd, n, &ddev->cmd_queue, queue) { list_del(&cmd->queue); + + /* Call the command callback if any and pass it a ENODEV error. + * This gives a chance to the command issuer to free any + * allocated buffer. + */ + if (cmd->cmd_cb) + cmd->cmd_cb(ddev, cmd->cb_context, ERR_PTR(-ENODEV)); + kfree(cmd->mdaa_params); kfree(cmd); } diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index f72be7433..f864ce19e 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -35,6 +35,8 @@ #define DIGITAL_ATR_REQ_MIN_SIZE 16 #define DIGITAL_ATR_REQ_MAX_SIZE 64 +#define DIGITAL_ATR_RES_TO_WT(s) ((s) & 0xF) + #define DIGITAL_DID_MAX 14 #define DIGITAL_PAYLOAD_SIZE_MAX 254 @@ -63,6 +65,9 @@ #define DIGITAL_NFC_DEP_DID_BIT_SET(pfb) ((pfb) & DIGITAL_NFC_DEP_PFB_DID_BIT) #define DIGITAL_NFC_DEP_PFB_PNI(pfb) ((pfb) & 0x03) +#define DIGITAL_NFC_DEP_RTOX_VALUE(data) ((data) & 0x3F) +#define DIGITAL_NFC_DEP_RTOX_MAX 59 + #define DIGITAL_NFC_DEP_PFB_I_PDU 0x00 #define DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU 0x40 #define DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU 0x80 @@ -122,6 +127,37 @@ static const u8 digital_payload_bits_map[4] = { [3] = 254 }; +/* Response Waiting Time for ATR_RES PDU in ms + * + * RWT(ATR_RES) = RWT(nfcdep,activation) + dRWT(nfcdep) + dT(nfcdep,initiator) + * + * with: + * RWT(nfcdep,activation) = 4096 * 2^12 / f(c) s + * dRWT(nfcdep) = 16 / f(c) s + * dT(nfcdep,initiator) = 100 ms + * f(c) = 13560000 Hz + */ +#define DIGITAL_ATR_RES_RWT 1337 + +/* Response Waiting Time for other DEP PDUs in ms + * + * max_rwt = rwt + dRWT(nfcdep) + dT(nfcdep,initiator) + * + * with: + * rwt = (256 * 16 / f(c)) * 2^wt s + * dRWT(nfcdep) = 16 / f(c) s + * dT(nfcdep,initiator) = 100 ms + * f(c) = 13560000 Hz + * 0 <= wt <= 14 (given by the target by the TO field of ATR_RES response) + */ +#define DIGITAL_NFC_DEP_IN_MAX_WT 14 +#define DIGITAL_NFC_DEP_TG_MAX_WT 8 +static const u16 digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT + 1] = { + 100, 101, 101, 102, 105, + 110, 119, 139, 177, 255, + 409, 719, 1337, 2575, 5049, +}; + static u8 digital_payload_bits_to_size(u8 payload_bits) { if (payload_bits >= ARRAY_SIZE(digital_payload_bits_map)) @@ -190,8 +226,6 @@ digital_send_dep_data_prep(struct nfc_digital_dev *ddev, struct sk_buff *skb, return ERR_PTR(-ENOMEM); } - skb_reserve(new_skb, ddev->tx_headroom + NFC_HEADER_SIZE + - DIGITAL_NFC_DEP_REQ_RES_HEADROOM); memcpy(skb_put(new_skb, ddev->remote_payload_max), skb->data, ddev->remote_payload_max); skb_pull(skb, ddev->remote_payload_max); @@ -368,8 +402,8 @@ static int digital_in_send_psl_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_psl_res, - target); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_psl_res, target); if (rc) kfree_skb(skb); @@ -382,6 +416,7 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, struct nfc_target *target = arg; struct digital_atr_res *atr_res; u8 gb_len, payload_bits; + u8 wt; int rc; if (IS_ERR(resp)) { @@ -411,6 +446,11 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, atr_res = (struct digital_atr_res *)resp->data; + wt = DIGITAL_ATR_RES_TO_WT(atr_res->to); + if (wt > DIGITAL_NFC_DEP_IN_MAX_WT) + wt = DIGITAL_NFC_DEP_IN_MAX_WT; + ddev->dep_rwt = digital_rwt_map[wt]; + payload_bits = DIGITAL_PAYLOAD_PP_TO_BITS(atr_res->pp); ddev->remote_payload_max = digital_payload_bits_to_size(payload_bits); @@ -492,8 +532,8 @@ int digital_in_send_atr_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_atr_res, - target); + rc = digital_in_send_cmd(ddev, skb, DIGITAL_ATR_RES_RWT, + digital_in_recv_atr_res, target); if (rc) kfree_skb(skb); @@ -524,11 +564,10 @@ static int digital_in_send_ack(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - ddev->saved_skb = skb_get(skb); - ddev->saved_skb_len = skb->len; + ddev->saved_skb = pskb_copy(skb, GFP_KERNEL); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) { kfree_skb(skb); kfree_skb(ddev->saved_skb); @@ -562,8 +601,8 @@ static int digital_in_send_nack(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(skb); @@ -593,8 +632,8 @@ static int digital_in_send_atn(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(skb); @@ -607,6 +646,11 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, struct digital_dep_req_res *dep_req; struct sk_buff *skb; int rc; + u16 rwt_int; + + rwt_int = ddev->dep_rwt * rtox; + if (rwt_int > digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT]) + rwt_int = digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT]; skb = digital_skb_alloc(ddev, 1); if (!skb) @@ -627,16 +671,10 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - ddev->saved_skb = skb_get(skb); - ddev->saved_skb_len = skb->len; - - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); - if (rc) { + rc = digital_in_send_cmd(ddev, skb, rwt_int, + digital_in_recv_dep_res, data_exch); + if (rc) kfree_skb(skb); - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; - } return rc; } @@ -644,11 +682,19 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, static int digital_in_send_saved_skb(struct nfc_digital_dev *ddev, struct digital_data_exch *data_exch) { + int rc; + + if (!ddev->saved_skb) + return -EINVAL; + skb_get(ddev->saved_skb); - skb_push(ddev->saved_skb, ddev->saved_skb_len); - return digital_in_send_cmd(ddev, ddev->saved_skb, 1500, - digital_in_recv_dep_res, data_exch); + rc = digital_in_send_cmd(ddev, ddev->saved_skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); + if (rc) + kfree_skb(ddev->saved_skb); + + return rc; } static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, @@ -659,12 +705,13 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, u8 pfb; uint size; int rc; + u8 rtox; if (IS_ERR(resp)) { rc = PTR_ERR(resp); resp = NULL; - if (((rc != -ETIMEDOUT) || ddev->nack_count) && + if ((rc == -EIO || (rc == -ETIMEDOUT && ddev->nack_count)) && (ddev->nack_count++ < DIGITAL_NFC_DEP_N_RETRY_NACK)) { ddev->atn_count = 0; @@ -783,6 +830,12 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, break; case DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU: + if (DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { + PROTOCOL_ERR("14.12.4.5"); + rc = -EIO; + goto exit; + } + if (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni) { PROTOCOL_ERR("14.12.3.3"); rc = -EIO; @@ -792,43 +845,53 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, ddev->curr_nfc_dep_pni = DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni + 1); - if (ddev->chaining_skb && !DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; + if (!ddev->chaining_skb) { + PROTOCOL_ERR("14.12.4.3"); + rc = -EIO; + goto exit; + } - rc = digital_in_send_dep_req(ddev, NULL, - ddev->chaining_skb, - ddev->data_exch); - if (rc) - goto error; + /* The initiator has received a valid ACK. Free the last sent + * PDU and keep on sending chained skb. + */ + kfree_skb(ddev->saved_skb); + ddev->saved_skb = NULL; - return; - } + rc = digital_in_send_dep_req(ddev, NULL, + ddev->chaining_skb, + ddev->data_exch); + if (rc) + goto error; - pr_err("Received a ACK/NACK PDU\n"); - rc = -EINVAL; - goto exit; + goto free_resp; case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU: if (!DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) { /* ATN */ rc = digital_in_send_saved_skb(ddev, data_exch); - if (rc) { - kfree_skb(ddev->saved_skb); + if (rc) goto error; - } - return; + goto free_resp; } - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; + if (ddev->atn_count || ddev->nack_count) { + PROTOCOL_ERR("14.12.4.4"); + rc = -EIO; + goto error; + } + + rtox = DIGITAL_NFC_DEP_RTOX_VALUE(resp->data[0]); + if (!rtox || rtox > DIGITAL_NFC_DEP_RTOX_MAX) { + PROTOCOL_ERR("14.8.4.1"); + rc = -EIO; + goto error; + } - rc = digital_in_send_rtox(ddev, data_exch, resp->data[0]); + rc = digital_in_send_rtox(ddev, data_exch, rtox); if (rc) goto error; - kfree_skb(resp); - return; + goto free_resp; } exit: @@ -845,6 +908,11 @@ error: if (rc) kfree_skb(resp); + + return; + +free_resp: + dev_kfree_skb(resp); } int digital_in_send_dep_req(struct nfc_digital_dev *ddev, @@ -876,11 +944,10 @@ int digital_in_send_dep_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(tmp_skb); - ddev->saved_skb = skb_get(tmp_skb); - ddev->saved_skb_len = tmp_skb->len; + ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL); - rc = digital_in_send_cmd(ddev, tmp_skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, tmp_skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) { if (tmp_skb != skb) kfree_skb(tmp_skb); @@ -956,8 +1023,7 @@ static int digital_tg_send_ack(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - ddev->saved_skb = skb_get(skb); - ddev->saved_skb_len = skb->len; + ddev->saved_skb = pskb_copy(skb, GFP_KERNEL); rc = digital_tg_send_cmd(ddev, skb, 1500, digital_tg_recv_dep_req, data_exch); @@ -1009,11 +1075,19 @@ static int digital_tg_send_atn(struct nfc_digital_dev *ddev) static int digital_tg_send_saved_skb(struct nfc_digital_dev *ddev) { + int rc; + + if (!ddev->saved_skb) + return -EINVAL; + skb_get(ddev->saved_skb); - skb_push(ddev->saved_skb, ddev->saved_skb_len); - return digital_tg_send_cmd(ddev, ddev->saved_skb, 1500, - digital_tg_recv_dep_req, NULL); + rc = digital_tg_send_cmd(ddev, ddev->saved_skb, 1500, + digital_tg_recv_dep_req, NULL); + if (rc) + kfree_skb(ddev->saved_skb); + + return rc; } static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, @@ -1086,22 +1160,38 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, case DIGITAL_NFC_DEP_PFB_I_PDU: pr_debug("DIGITAL_NFC_DEP_PFB_I_PDU\n"); - if ((ddev->atn_count && (DIGITAL_NFC_DEP_PFB_PNI(pfb - 1) != - ddev->curr_nfc_dep_pni)) || - (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni)) { - PROTOCOL_ERR("14.12.3.4"); - rc = -EIO; - goto exit; - } - if (ddev->atn_count) { + /* The target has received (and replied to) at least one + * ATN DEP_REQ. + */ ddev->atn_count = 0; - rc = digital_tg_send_saved_skb(ddev); - if (rc) - goto exit; + /* pni of resp PDU equal to the target current pni - 1 + * means resp is the previous DEP_REQ PDU received from + * the initiator so the target replies with saved_skb + * which is the previous DEP_RES saved in + * digital_tg_send_dep_res(). + */ + if (DIGITAL_NFC_DEP_PFB_PNI(pfb) == + DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni - 1)) { + rc = digital_tg_send_saved_skb(ddev); + if (rc) + goto exit; - return; + goto free_resp; + } + + /* atn_count > 0 and PDU pni != curr_nfc_dep_pni - 1 + * means the target probably did not received the last + * DEP_REQ PDU sent by the initiator. The target + * fallbacks to normal processing then. + */ + } + + if (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni) { + PROTOCOL_ERR("14.12.3.4"); + rc = -EIO; + goto exit; } kfree_skb(ddev->saved_skb); @@ -1125,51 +1215,64 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, rc = 0; break; case DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU: - if (!DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { /* ACK */ - if ((ddev->atn_count && - (DIGITAL_NFC_DEP_PFB_PNI(pfb - 1) != - ddev->curr_nfc_dep_pni)) || - (DIGITAL_NFC_DEP_PFB_PNI(pfb) != - ddev->curr_nfc_dep_pni) || - !ddev->chaining_skb || !ddev->saved_skb) { + if (DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { /* NACK */ + if (DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) != + ddev->curr_nfc_dep_pni) { rc = -EIO; goto exit; } - if (ddev->atn_count) { - ddev->atn_count = 0; + ddev->atn_count = 0; + rc = digital_tg_send_saved_skb(ddev); + if (rc) + goto exit; + + goto free_resp; + } + + /* ACK */ + if (ddev->atn_count) { + /* The target has previously recevied one or more ATN + * PDUs. + */ + ddev->atn_count = 0; + + /* If the ACK PNI is equal to the target PNI - 1 means + * that the initiator did not receive the previous PDU + * sent by the target so re-send it. + */ + if (DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) == + ddev->curr_nfc_dep_pni) { rc = digital_tg_send_saved_skb(ddev); if (rc) goto exit; - return; + goto free_resp; } - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; + /* Otherwise, the target did not receive the previous + * ACK PDU from the initiator. Fallback to normal + * processing of chained PDU then. + */ + } - rc = digital_tg_send_dep_res(ddev, ddev->chaining_skb); - if (rc) - goto exit; - } else { /* NACK */ - if ((DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) != - ddev->curr_nfc_dep_pni) || - !ddev->saved_skb) { - rc = -EIO; - goto exit; - } + /* Keep on sending chained PDU */ + if (!ddev->chaining_skb || + DIGITAL_NFC_DEP_PFB_PNI(pfb) != + ddev->curr_nfc_dep_pni) { + rc = -EIO; + goto exit; + } - ddev->atn_count = 0; + kfree_skb(ddev->saved_skb); + ddev->saved_skb = NULL; - rc = digital_tg_send_saved_skb(ddev); - if (rc) { - kfree_skb(ddev->saved_skb); - goto exit; - } - } + rc = digital_tg_send_dep_res(ddev, ddev->chaining_skb); + if (rc) + goto exit; - return; + goto free_resp; case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU: if (DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) { rc = -EINVAL; @@ -1182,8 +1285,7 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, ddev->atn_count++; - kfree_skb(resp); - return; + goto free_resp; } rc = nfc_tm_data_received(ddev->nfc_dev, resp); @@ -1199,6 +1301,11 @@ exit: if (rc) kfree_skb(resp); + + return; + +free_resp: + dev_kfree_skb(resp); } int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb) @@ -1235,8 +1342,7 @@ int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb) ddev->skb_add_crc(tmp_skb); - ddev->saved_skb = skb_get(tmp_skb); - ddev->saved_skb_len = tmp_skb->len; + ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL); rc = digital_tg_send_cmd(ddev, tmp_skb, 1500, digital_tg_recv_dep_req, NULL); @@ -1420,7 +1526,7 @@ static int digital_tg_send_atr_res(struct nfc_digital_dev *ddev, atr_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN; atr_res->cmd = DIGITAL_CMD_ATR_RES; memcpy(atr_res->nfcid3, atr_req->nfcid3, sizeof(atr_req->nfcid3)); - atr_res->to = 8; + atr_res->to = DIGITAL_NFC_DEP_TG_MAX_WT; ddev->local_payload_max = DIGITAL_PAYLOAD_SIZE_MAX; payload_bits = digital_payload_size_to_bits(ddev->local_payload_max); diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index fb58ed2dd..d9080dec5 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -1257,21 +1257,12 @@ static int digital_tg_config_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech) int digital_tg_listen_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech) { int rc; - u8 *nfcid2; rc = digital_tg_config_nfcf(ddev, rf_tech); if (rc) return rc; - nfcid2 = kzalloc(NFC_NFCID2_MAXSIZE, GFP_KERNEL); - if (!nfcid2) - return -ENOMEM; - - nfcid2[0] = DIGITAL_SENSF_NFCID2_NFC_DEP_B1; - nfcid2[1] = DIGITAL_SENSF_NFCID2_NFC_DEP_B2; - get_random_bytes(nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2); - - return digital_tg_listen(ddev, 300, digital_tg_recv_sensf_req, nfcid2); + return digital_tg_listen(ddev, 300, digital_tg_recv_sensf_req, NULL); } void digital_tg_recv_md_req(struct nfc_digital_dev *ddev, void *arg, diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index 1399a03fa..3d699cbc7 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -133,36 +133,29 @@ void nfc_llc_free(struct nfc_llc *llc) kfree(llc); } -inline void nfc_llc_get_rx_head_tail_room(struct nfc_llc *llc, int *rx_headroom, - int *rx_tailroom) -{ - *rx_headroom = llc->rx_headroom; - *rx_tailroom = llc->rx_tailroom; -} - -inline int nfc_llc_start(struct nfc_llc *llc) +int nfc_llc_start(struct nfc_llc *llc) { return llc->ops->start(llc); } EXPORT_SYMBOL(nfc_llc_start); -inline int nfc_llc_stop(struct nfc_llc *llc) +int nfc_llc_stop(struct nfc_llc *llc) { return llc->ops->stop(llc); } EXPORT_SYMBOL(nfc_llc_stop); -inline void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb) +void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb) { llc->ops->rcv_from_drv(llc, skb); } -inline int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) +int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) { return llc->ops->xmit_from_hci(llc, skb); } -inline void *nfc_llc_get_data(struct nfc_llc *llc) +void *nfc_llc_get_data(struct nfc_llc *llc) { return llc->data; } diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 3425532c3..c5959ce50 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -438,19 +438,17 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) goto error_tlv; } - if (service_name_tlv != NULL) - skb = llcp_add_tlv(skb, service_name_tlv, - service_name_tlv_length); - - skb = llcp_add_tlv(skb, miux_tlv, miux_tlv_length); - skb = llcp_add_tlv(skb, rw_tlv, rw_tlv_length); + llcp_add_tlv(skb, service_name_tlv, service_name_tlv_length); + llcp_add_tlv(skb, miux_tlv, miux_tlv_length); + llcp_add_tlv(skb, rw_tlv, rw_tlv_length); skb_queue_tail(&local->tx_queue, skb); - return 0; + err = 0; error_tlv: - pr_err("error %d\n", err); + if (err) + pr_err("error %d\n", err); kfree(service_name_tlv); kfree(miux_tlv); @@ -493,15 +491,16 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) goto error_tlv; } - skb = llcp_add_tlv(skb, miux_tlv, miux_tlv_length); - skb = llcp_add_tlv(skb, rw_tlv, rw_tlv_length); + llcp_add_tlv(skb, miux_tlv, miux_tlv_length); + llcp_add_tlv(skb, rw_tlv, rw_tlv_length); skb_queue_tail(&local->tx_queue, skb); - return 0; + err = 0; error_tlv: - pr_err("error %d\n", err); + if (err) + pr_err("error %d\n", err); kfree(miux_tlv); kfree(rw_tlv); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 98876274a..e69786c68 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -732,9 +732,8 @@ static void nfc_llcp_tx_work(struct work_struct *work) int ret; pr_debug("Sending pending skb\n"); - print_hex_dump(KERN_DEBUG, "LLCP Tx: ", - DUMP_PREFIX_OFFSET, 16, 1, - skb->data, skb->len, true); + print_hex_dump_debug("LLCP Tx: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->data, skb->len, true); if (ptype == LLCP_PDU_DISC && sk != NULL && sk->sk_state == LLCP_DISCONNECTING) { @@ -1412,8 +1411,8 @@ static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb) pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap); if (ptype != LLCP_PDU_SYMM) - print_hex_dump(KERN_DEBUG, "LLCP Rx: ", DUMP_PREFIX_OFFSET, - 16, 1, skb->data, skb->len, true); + print_hex_dump_debug("LLCP Rx: ", DUMP_PREFIX_OFFSET, 16, 1, + skb->data, skb->len, true); switch (ptype) { case LLCP_PDU_SYMM: diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 9a3eb7a0e..1ecbd7715 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -750,6 +750,14 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, if (likely(vport)) { u16 mru = OVS_CB(skb)->mru; + u32 cutlen = OVS_CB(skb)->cutlen; + + if (unlikely(cutlen > 0)) { + if (skb->len - cutlen > ETH_HLEN) + pskb_trim(skb, skb->len - cutlen); + else + pskb_trim(skb, ETH_HLEN); + } if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { ovs_vport_send(vport, skb); @@ -775,7 +783,8 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, - const struct nlattr *actions, int actions_len) + const struct nlattr *actions, int actions_len, + uint32_t cutlen) { struct dp_upcall_info upcall; const struct nlattr *a; @@ -822,7 +831,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, } /* End of switch. */ } - return ovs_dp_upcall(dp, skb, key, &upcall); + return ovs_dp_upcall(dp, skb, key, &upcall, cutlen); } static int sample(struct datapath *dp, struct sk_buff *skb, @@ -832,6 +841,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, const struct nlattr *acts_list = NULL; const struct nlattr *a; int rem; + u32 cutlen = 0; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -858,13 +868,24 @@ static int sample(struct datapath *dp, struct sk_buff *skb, return 0; /* The only known usage of sample action is having a single user-space + * action, or having a truncate action followed by a single user-space * action. Treat this usage as a special case. * The output_userspace() should clone the skb to be sent to the * user space. This skb will be consumed by its caller. */ + if (unlikely(nla_type(a) == OVS_ACTION_ATTR_TRUNC)) { + struct ovs_action_trunc *trunc = nla_data(a); + + if (skb->len > trunc->max_len) + cutlen = skb->len - trunc->max_len; + + a = nla_next(a, &rem); + } + if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && nla_is_last(a, rem))) - return output_userspace(dp, skb, key, a, actions, actions_len); + return output_userspace(dp, skb, key, a, actions, + actions_len, cutlen); skb = skb_clone(skb, GFP_ATOMIC); if (!skb) @@ -1051,6 +1072,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, if (out_skb) do_output(dp, out_skb, prev_port, key); + OVS_CB(skb)->cutlen = 0; prev_port = -1; } @@ -1059,8 +1081,18 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, prev_port = nla_get_u32(a); break; + case OVS_ACTION_ATTR_TRUNC: { + struct ovs_action_trunc *trunc = nla_data(a); + + if (skb->len > trunc->max_len) + OVS_CB(skb)->cutlen = skb->len - trunc->max_len; + break; + } + case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, key, a, attr, len); + output_userspace(dp, skb, key, a, attr, + len, OVS_CB(skb)->cutlen); + OVS_CB(skb)->cutlen = 0; break; case OVS_ACTION_ATTR_HASH: diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index d84312584..e054a748f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -135,7 +135,7 @@ static void ovs_ct_get_labels(const struct nf_conn *ct, struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; if (cl) { - size_t len = cl->words * sizeof(long); + size_t len = sizeof(cl->bits); if (len > OVS_CT_LABELS_LEN) len = OVS_CT_LABELS_LEN; @@ -274,7 +274,7 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, nf_ct_labels_ext_add(ct); cl = nf_ct_labels_find(ct); } - if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN) + if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN) return -ENOSPC; err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, @@ -433,7 +433,6 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; - enum ip_conntrack_info ctinfo; struct nf_conn *ct; unsigned int dataoff; u8 protonum; @@ -458,13 +457,8 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, ct = nf_ct_tuplehash_to_ctrack(h); - ctinfo = ovs_ct_get_info(h); - if (ctinfo == IP_CT_NEW) { - /* This should not happen. */ - WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct); - } skb->nfct = &ct->ct_general; - skb->nfctinfo = ctinfo; + skb->nfctinfo = ovs_ct_get_info(h); return ct; } @@ -834,6 +828,17 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, return 0; } +static bool labels_nonzero(const struct ovs_key_ct_labels *labels) +{ + size_t i; + + for (i = 0; i < sizeof(*labels); i++) + if (labels->ct_labels[i]) + return true; + + return false; +} + /* Lookup connection and confirm if unconfirmed. */ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, @@ -844,24 +849,32 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, err = __ovs_ct_lookup(net, key, info, skb); if (err) return err; - /* This is a no-op if the connection has already been confirmed. */ + + /* Apply changes before confirming the connection so that the initial + * conntrack NEW netlink event carries the values given in the CT + * action. + */ + if (info->mark.mask) { + err = ovs_ct_set_mark(skb, key, info->mark.value, + info->mark.mask); + if (err) + return err; + } + if (labels_nonzero(&info->labels.mask)) { + err = ovs_ct_set_labels(skb, key, &info->labels.value, + &info->labels.mask); + if (err) + return err; + } + /* This will take care of sending queued events even if the connection + * is already confirmed. + */ if (nf_conntrack_confirm(skb) != NF_ACCEPT) return -EINVAL; return 0; } -static bool labels_nonzero(const struct ovs_key_ct_labels *labels) -{ - size_t i; - - for (i = 0; i < sizeof(*labels); i++) - if (labels->ct_labels[i]) - return true; - - return false; -} - /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero * value if 'skb' is freed. */ @@ -886,19 +899,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, err = ovs_ct_commit(net, key, info, skb); else err = ovs_ct_lookup(net, key, info, skb); - if (err) - goto err; - if (info->mark.mask) { - err = ovs_ct_set_mark(skb, key, info->mark.value, - info->mark.mask); - if (err) - goto err; - } - if (labels_nonzero(&info->labels.mask)) - err = ovs_ct_set_labels(skb, key, &info->labels.value, - &info->labels.mask); -err: skb_push(skb, nh_ofs); if (err) kfree_skb(skb); @@ -1155,6 +1156,20 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, } } +#ifdef CONFIG_NF_CONNTRACK_MARK + if (!info->commit && info->mark.mask) { + OVS_NLERR(log, + "Setting conntrack mark requires 'commit' flag."); + return -EINVAL; + } +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + if (!info->commit && labels_nonzero(&info->labels.mask)) { + OVS_NLERR(log, + "Setting conntrack labels requires 'commit' flag."); + return -EINVAL; + } +#endif if (rem > 0) { OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); return -EINVAL; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 856bd8dba..524c0fd30 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -137,10 +137,12 @@ EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held); static struct vport *new_vport(const struct vport_parms *); static int queue_gso_packets(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, - const struct dp_upcall_info *); + const struct dp_upcall_info *, + uint32_t cutlen); static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, - const struct dp_upcall_info *); + const struct dp_upcall_info *, + uint32_t cutlen); /* Must be called with rcu_read_lock. */ static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) @@ -275,7 +277,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) upcall.cmd = OVS_PACKET_CMD_MISS; upcall.portid = ovs_vport_find_upcall_portid(p, skb); upcall.mru = OVS_CB(skb)->mru; - error = ovs_dp_upcall(dp, skb, key, &upcall); + error = ovs_dp_upcall(dp, skb, key, &upcall, 0); if (unlikely(error)) kfree_skb(skb); else @@ -300,7 +302,8 @@ out: int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info) + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { struct dp_stats_percpu *stats; int err; @@ -311,9 +314,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, } if (!skb_is_gso(skb)) - err = queue_userspace_packet(dp, skb, key, upcall_info); + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else - err = queue_gso_packets(dp, skb, key, upcall_info); + err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); if (err) goto err; @@ -331,7 +334,8 @@ err: static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info) + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; @@ -360,7 +364,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; - err = queue_userspace_packet(dp, skb, key, upcall_info); + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; @@ -383,7 +387,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ - + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) @@ -416,7 +421,8 @@ static void pad_packet(struct datapath *dp, struct sk_buff *skb) static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info) + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; @@ -461,7 +467,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, else hlen = skb->len; - len = upcall_msg_size(upcall_info, hlen); + len = upcall_msg_size(upcall_info, hlen - cutlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; @@ -509,15 +515,25 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, pad_packet(dp, user_skb); } + /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ + if (cutlen > 0) { + if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, + skb->len)) { + err = -ENOBUFS; + goto out; + } + pad_packet(dp, user_skb); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { err = -ENOBUFS; goto out; } - nla->nla_len = nla_attr_size(skb->len); + nla->nla_len = nla_attr_size(skb->len - cutlen); - err = skb_zerocopy(user_skb, skb, skb->len, hlen); + err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen); if (err) goto out; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 427e39a04..ab85c1cae 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -100,11 +100,13 @@ struct datapath { * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. * @mru: The maximum received fragement size; 0 if the packet is not + * @cutlen: The number of bytes from the packet end to be removed. * fragmented. */ struct ovs_skb_cb { struct vport *input_vport; u16 mru; + u32 cutlen; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -194,7 +196,8 @@ extern struct genl_family dp_vport_genl_family; void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, - const struct sw_flow_key *, const struct dp_upcall_info *); + const struct sw_flow_key *, const struct dp_upcall_info *, + uint32_t cutlen); const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 0bb650f4f..c78a6a147 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2229,6 +2229,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, + [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2255,6 +2256,14 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_TRUNC: { + const struct ovs_action_trunc *trunc = nla_data(a); + + if (trunc->max_len < ETH_HLEN) + return -EINVAL; + break; + } + case OVS_ACTION_ATTR_HASH: { const struct ovs_action_hash *act_hash = nla_data(a); diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 1a1fcec88..5aaf3babf 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -93,7 +93,14 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP); + if (err < 0) { + rtnl_delete_link(dev); + rtnl_unlock(); + ovs_vport_free(vport); + goto error; + } + rtnl_unlock(); return vport; error: diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 7f8897f33..0e72d95b0 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -54,6 +54,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) struct net *net = ovs_dp_get_net(parms->dp); struct net_device *dev; struct vport *vport; + int err; vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms); if (IS_ERR(vport)) @@ -67,9 +68,15 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - dev_change_flags(dev, dev->flags | IFF_UP); - rtnl_unlock(); + err = dev_change_flags(dev, dev->flags | IFF_UP); + if (err < 0) { + rtnl_delete_link(dev); + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_PTR(err); + } + rtnl_unlock(); return vport; } diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 2ee48e447..95c36147a 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -140,7 +140,7 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) static void internal_set_rx_headroom(struct net_device *dev, int new_hr) { - dev->needed_headroom = new_hr; + dev->needed_headroom = new_hr < 0 ? 0 : new_hr; } static const struct net_device_ops internal_dev_netdev_ops = { @@ -195,7 +195,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) } vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_UNKNOWN, do_setup); + parms->name, NET_NAME_USER, do_setup); if (!vport->dev) { err = -ENOMEM; goto error_free_vport; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 5eb769434..7eb955e45 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -130,7 +130,14 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP); + if (err < 0) { + rtnl_delete_link(dev); + rtnl_unlock(); + ovs_vport_free(vport); + goto error; + } + rtnl_unlock(); return vport; error: diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 31cbc8c5c..6b21fd068 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -444,6 +444,7 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->mru = 0; + OVS_CB(skb)->cutlen = 0; if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { u32 mark; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b43c4015b..33a4697d5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1588,13 +1588,9 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, if (copy_from_user(&fd, data, len)) return -EFAULT; - new = bpf_prog_get(fd); + new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); - if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(new); - return -EINVAL; - } __fanout_set_data_bpf(po->fanout, new); return 0; @@ -1977,40 +1973,8 @@ static int __packet_rcv_vnet(const struct sk_buff *skb, { *vnet_hdr = (const struct virtio_net_hdr) { 0 }; - if (skb_is_gso(skb)) { - struct skb_shared_info *sinfo = skb_shinfo(skb); - - /* This is a hint as to how much should be linear. */ - vnet_hdr->hdr_len = - __cpu_to_virtio16(vio_le(), skb_headlen(skb)); - vnet_hdr->gso_size = - __cpu_to_virtio16(vio_le(), sinfo->gso_size); - - if (sinfo->gso_type & SKB_GSO_TCPV4) - vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - else if (sinfo->gso_type & SKB_GSO_TCPV6) - vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - else if (sinfo->gso_type & SKB_GSO_UDP) - vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; - else if (sinfo->gso_type & SKB_GSO_FCOE) - return -EINVAL; - else - BUG(); - - if (sinfo->gso_type & SKB_GSO_TCP_ECN) - vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; - } else - vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(), - skb_checksum_start_offset(skb)); - vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(), - skb->csum_offset); - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; - } /* else everything is zero */ + if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le())) + BUG(); return 0; } diff --git a/net/rds/bind.c b/net/rds/bind.c index b22ea9565..095f6ce58 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -81,6 +81,8 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) if (*port != 0) { rover = be16_to_cpu(*port); + if (rover == RDS_FLAG_PROBE_PORT) + return -EINVAL; last = rover; } else { rover = max_t(u16, prandom_u32(), 2); @@ -91,12 +93,16 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) if (rover == 0) rover++; + if (rover == RDS_FLAG_PROBE_PORT) + continue; key = ((u64)addr << 32) | cpu_to_be16(rover); if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms)) continue; rs->rs_bound_key = key; rs->rs_bound_addr = addr; + net_get_random_once(&rs->rs_hash_initval, + sizeof(rs->rs_hash_initval)); rs->rs_bound_port = cpu_to_be16(rover); rs->rs_bound_node.next = NULL; rds_sock_addref(rs); diff --git a/net/rds/cong.c b/net/rds/cong.c index 6641bcf7c..8398fee7c 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -235,7 +235,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map) * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. */ - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, + &conn->c_path[0].cp_send_w, 0); } } diff --git a/net/rds/connection.c b/net/rds/connection.c index e3b118cae..f5058559b 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -95,14 +95,16 @@ static struct rds_connection *rds_conn_lookup(struct net *net, * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. */ -static void rds_conn_reset(struct rds_connection *conn) +static void rds_conn_path_reset(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; + rdsdebug("connection %pI4 to %pI4 reset\n", &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); - rds_send_reset(conn); - conn->c_flags = 0; + rds_send_path_reset(cp); + cp->cp_flags = 0; /* Do not clear next_rx_seq here, else we cannot distinguish * retransmitted packets from new packets, and will hand all @@ -110,6 +112,32 @@ static void rds_conn_reset(struct rds_connection *conn) * reliability guarantees of RDS. */ } +static void __rds_conn_path_init(struct rds_connection *conn, + struct rds_conn_path *cp, bool is_outgoing) +{ + spin_lock_init(&cp->cp_lock); + cp->cp_next_tx_seq = 1; + init_waitqueue_head(&cp->cp_waitq); + INIT_LIST_HEAD(&cp->cp_send_queue); + INIT_LIST_HEAD(&cp->cp_retrans); + + cp->cp_conn = conn; + atomic_set(&cp->cp_state, RDS_CONN_DOWN); + cp->cp_send_gen = 0; + /* cp_outgoing is per-path. So we can only set it here + * for the single-path transports. + */ + if (!conn->c_trans->t_mp_capable) + cp->cp_outgoing = (is_outgoing ? 1 : 0); + cp->cp_reconnect_jiffies = 0; + INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); + INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); + INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); + INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); + mutex_init(&cp->cp_cm_lock); + cp->cp_flags = 0; +} + /* * There is only every one 'conn' for a given pair of addresses in the * system at a time. They contain messages to be retransmitted and so @@ -127,7 +155,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct rds_transport *loop_trans; unsigned long flags; - int ret; + int ret, i; rcu_read_lock(); conn = rds_conn_lookup(net, head, laddr, faddr, trans); @@ -153,13 +181,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, INIT_HLIST_NODE(&conn->c_hash_node); conn->c_laddr = laddr; conn->c_faddr = faddr; - spin_lock_init(&conn->c_lock); - conn->c_next_tx_seq = 1; - rds_conn_net_set(conn, net); - init_waitqueue_head(&conn->c_waitq); - INIT_LIST_HEAD(&conn->c_send_queue); - INIT_LIST_HEAD(&conn->c_retrans); + rds_conn_net_set(conn, net); ret = rds_cong_get_maps(conn); if (ret) { @@ -188,6 +211,12 @@ static struct rds_connection *__rds_conn_create(struct net *net, conn->c_trans = trans; + init_waitqueue_head(&conn->c_hs_waitq); + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + __rds_conn_path_init(conn, &conn->c_path[i], + is_outgoing); + conn->c_path[i].cp_index = i; + } ret = trans->conn_alloc(conn, gfp); if (ret) { kmem_cache_free(rds_conn_slab, conn); @@ -195,17 +224,6 @@ static struct rds_connection *__rds_conn_create(struct net *net, goto out; } - atomic_set(&conn->c_state, RDS_CONN_DOWN); - conn->c_send_gen = 0; - conn->c_outgoing = (is_outgoing ? 1 : 0); - conn->c_reconnect_jiffies = 0; - INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); - INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); - INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker); - INIT_WORK(&conn->c_down_w, rds_shutdown_worker); - mutex_init(&conn->c_cm_lock); - conn->c_flags = 0; - rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", conn, &laddr, &faddr, trans->t_name ? trans->t_name : "[unknown]", @@ -222,7 +240,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (parent) { /* Creating passive conn */ if (parent->c_passive) { - trans->conn_free(conn->c_transport_data); + trans->conn_free(conn->c_path[0].cp_transport_data); kmem_cache_free(rds_conn_slab, conn); conn = parent->c_passive; } else { @@ -236,7 +254,18 @@ static struct rds_connection *__rds_conn_create(struct net *net, found = rds_conn_lookup(net, head, laddr, faddr, trans); if (found) { - trans->conn_free(conn->c_transport_data); + struct rds_conn_path *cp; + int i; + + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + cp = &conn->c_path[i]; + /* The ->conn_alloc invocation may have + * allocated resource for all paths, so all + * of them may have to be freed here. + */ + if (cp->cp_transport_data) + trans->conn_free(cp->cp_transport_data); + } kmem_cache_free(rds_conn_slab, conn); conn = found; } else { @@ -267,10 +296,12 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net, } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); -void rds_conn_shutdown(struct rds_connection *conn) +void rds_conn_shutdown(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; + /* shut it down unless it's down already */ - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) { /* * Quiesce the connection mgmt handlers before we start tearing * things down. We don't hold the mutex for the entire @@ -278,35 +309,38 @@ void rds_conn_shutdown(struct rds_connection *conn) * deadlocking with the CM handler. Instead, the CM event * handler is supposed to check for state DISCONNECTING */ - mutex_lock(&conn->c_cm_lock); - if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) - && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { - rds_conn_error(conn, "shutdown called in state %d\n", - atomic_read(&conn->c_state)); - mutex_unlock(&conn->c_cm_lock); + mutex_lock(&cp->cp_cm_lock); + if (!rds_conn_path_transition(cp, RDS_CONN_UP, + RDS_CONN_DISCONNECTING) && + !rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_DISCONNECTING)) { + rds_conn_path_error(cp, + "shutdown called in state %d\n", + atomic_read(&cp->cp_state)); + mutex_unlock(&cp->cp_cm_lock); return; } - mutex_unlock(&conn->c_cm_lock); + mutex_unlock(&cp->cp_cm_lock); - wait_event(conn->c_waitq, - !test_bit(RDS_IN_XMIT, &conn->c_flags)); - wait_event(conn->c_waitq, - !test_bit(RDS_RECV_REFILL, &conn->c_flags)); + wait_event(cp->cp_waitq, + !test_bit(RDS_IN_XMIT, &cp->cp_flags)); + wait_event(cp->cp_waitq, + !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); - conn->c_trans->conn_shutdown(conn); - rds_conn_reset(conn); + conn->c_trans->conn_path_shutdown(cp); + rds_conn_path_reset(cp); - if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, + RDS_CONN_DOWN)) { /* This can happen - eg when we're in the middle of tearing * down the connection, and someone unloads the rds module. * Quite reproduceable with loopback connections. * Mostly harmless. */ - rds_conn_error(conn, - "%s: failed to transition to state DOWN, " - "current state is %d\n", - __func__, - atomic_read(&conn->c_state)); + rds_conn_path_error(cp, "%s: failed to transition " + "to state DOWN, current state " + "is %d\n", __func__, + atomic_read(&cp->cp_state)); return; } } @@ -315,18 +349,47 @@ void rds_conn_shutdown(struct rds_connection *conn) * The passive side of an IB loopback connection is never added * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ - cancel_delayed_work_sync(&conn->c_conn_w); + cancel_delayed_work_sync(&cp->cp_conn_w); rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); - if (conn->c_trans->t_type != RDS_TRANS_TCP || - conn->c_outgoing == 1) - rds_queue_reconnect(conn); + rds_queue_reconnect(cp); } else { rcu_read_unlock(); } } +/* destroy a single rds_conn_path. rds_conn_destroy() iterates over + * all paths using rds_conn_path_destroy() + */ +static void rds_conn_path_destroy(struct rds_conn_path *cp) +{ + struct rds_message *rm, *rtmp; + + if (!cp->cp_transport_data) + return; + + rds_conn_path_drop(cp); + flush_work(&cp->cp_down_w); + + /* make sure lingering queued work won't try to ref the conn */ + cancel_delayed_work_sync(&cp->cp_send_w); + cancel_delayed_work_sync(&cp->cp_recv_w); + + /* tear down queued messages */ + list_for_each_entry_safe(rm, rtmp, + &cp->cp_send_queue, + m_conn_item) { + list_del_init(&rm->m_conn_item); + BUG_ON(!list_empty(&rm->m_sock_item)); + rds_message_put(rm); + } + if (cp->cp_xmit_rm) + rds_message_put(cp->cp_xmit_rm); + + cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); +} + /* * Stop and free a connection. * @@ -336,8 +399,9 @@ void rds_conn_shutdown(struct rds_connection *conn) */ void rds_conn_destroy(struct rds_connection *conn) { - struct rds_message *rm, *rtmp; unsigned long flags; + int i; + struct rds_conn_path *cp; rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, @@ -350,25 +414,11 @@ void rds_conn_destroy(struct rds_connection *conn) synchronize_rcu(); /* shut the connection down */ - rds_conn_drop(conn); - flush_work(&conn->c_down_w); - - /* make sure lingering queued work won't try to ref the conn */ - cancel_delayed_work_sync(&conn->c_send_w); - cancel_delayed_work_sync(&conn->c_recv_w); - - /* tear down queued messages */ - list_for_each_entry_safe(rm, rtmp, - &conn->c_send_queue, - m_conn_item) { - list_del_init(&rm->m_conn_item); - BUG_ON(!list_empty(&rm->m_sock_item)); - rds_message_put(rm); + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + cp = &conn->c_path[i]; + rds_conn_path_destroy(cp); + BUG_ON(!list_empty(&cp->cp_retrans)); } - if (conn->c_xmit_rm) - rds_message_put(conn->c_xmit_rm); - - conn->c_trans->conn_free(conn->c_transport_data); /* * The congestion maps aren't freed up here. They're @@ -377,7 +427,6 @@ void rds_conn_destroy(struct rds_connection *conn) */ rds_cong_remove_conn(conn); - BUG_ON(!list_empty(&conn->c_retrans)); kmem_cache_free(rds_conn_slab, conn); spin_lock_irqsave(&rds_conn_lock, flags); @@ -398,6 +447,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, unsigned int total = 0; unsigned long flags; size_t i; + int j; len /= sizeof(struct rds_info_message); @@ -406,23 +456,32 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { - if (want_send) - list = &conn->c_send_queue; - else - list = &conn->c_retrans; - - spin_lock_irqsave(&conn->c_lock, flags); - - /* XXX too lazy to maintain counts.. */ - list_for_each_entry(rm, list, m_conn_item) { - total++; - if (total <= len) - rds_inc_info_copy(&rm->m_inc, iter, - conn->c_laddr, - conn->c_faddr, 0); + struct rds_conn_path *cp; + + for (j = 0; j < RDS_MPATH_WORKERS; j++) { + cp = &conn->c_path[j]; + if (want_send) + list = &cp->cp_send_queue; + else + list = &cp->cp_retrans; + + spin_lock_irqsave(&cp->cp_lock, flags); + + /* XXX too lazy to maintain counts.. */ + list_for_each_entry(rm, list, m_conn_item) { + total++; + if (total <= len) + rds_inc_info_copy(&rm->m_inc, + iter, + conn->c_laddr, + conn->c_faddr, + 0); + } + + spin_unlock_irqrestore(&cp->cp_lock, flags); + if (!conn->c_trans->t_mp_capable) + break; } - - spin_unlock_irqrestore(&conn->c_lock, flags); } } rcu_read_unlock(); @@ -484,27 +543,72 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); -static int rds_conn_info_visitor(struct rds_connection *conn, - void *buffer) +void rds_walk_conn_path_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_conn_path *, void *), + size_t item_len) +{ + u64 buffer[(item_len + 7) / 8]; + struct hlist_head *head; + struct rds_connection *conn; + size_t i; + int j; + + rcu_read_lock(); + + lens->nr = 0; + lens->each = item_len; + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { + struct rds_conn_path *cp; + + for (j = 0; j < RDS_MPATH_WORKERS; j++) { + cp = &conn->c_path[j]; + + /* XXX no cp_lock usage.. */ + if (!visitor(cp, buffer)) + continue; + if (!conn->c_trans->t_mp_capable) + break; + } + + /* We copy as much as we can fit in the buffer, + * but we count all items so that the caller + * can resize the buffer. + */ + if (len >= item_len) { + rds_info_copy(iter, buffer, item_len); + len -= item_len; + } + lens->nr++; + } + } + rcu_read_unlock(); +} + +static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; - cinfo->next_tx_seq = conn->c_next_tx_seq; - cinfo->next_rx_seq = conn->c_next_rx_seq; - cinfo->laddr = conn->c_laddr; - cinfo->faddr = conn->c_faddr; - strncpy(cinfo->transport, conn->c_trans->t_name, + cinfo->next_tx_seq = cp->cp_next_tx_seq; + cinfo->next_rx_seq = cp->cp_next_rx_seq; + cinfo->laddr = cp->cp_conn->c_laddr; + cinfo->faddr = cp->cp_conn->c_faddr; + strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name, sizeof(cinfo->transport)); cinfo->flags = 0; - rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), + rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, - atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, + atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo->flags, - atomic_read(&conn->c_state) == RDS_CONN_UP, + atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); return 1; } @@ -513,7 +617,7 @@ static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { - rds_for_each_conn_info(sock, len, iter, lens, + rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor, sizeof(struct rds_info_connection)); } @@ -553,10 +657,17 @@ void rds_conn_exit(void) /* * Force a disconnect */ +void rds_conn_path_drop(struct rds_conn_path *cp) +{ + atomic_set(&cp->cp_state, RDS_CONN_ERROR); + queue_work(rds_wq, &cp->cp_down_w); +} +EXPORT_SYMBOL_GPL(rds_conn_path_drop); + void rds_conn_drop(struct rds_connection *conn) { - atomic_set(&conn->c_state, RDS_CONN_ERROR); - queue_work(rds_wq, &conn->c_down_w); + WARN_ON(conn->c_trans->t_mp_capable); + rds_conn_path_drop(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_drop); @@ -564,11 +675,17 @@ EXPORT_SYMBOL_GPL(rds_conn_drop); * If the connection is down, trigger a connect. We may have scheduled a * delayed reconnect however - in this case we should not interfere. */ +void rds_conn_path_connect_if_down(struct rds_conn_path *cp) +{ + if (rds_conn_path_state(cp) == RDS_CONN_DOWN && + !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); +} + void rds_conn_connect_if_down(struct rds_connection *conn) { - if (rds_conn_state(conn) == RDS_CONN_DOWN && - !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + WARN_ON(conn->c_trans->t_mp_capable); + rds_conn_path_connect_if_down(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); @@ -586,3 +703,15 @@ __rds_conn_error(struct rds_connection *conn, const char *fmt, ...) rds_conn_drop(conn); } + +void +__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); + + rds_conn_path_drop(cp); +} diff --git a/net/rds/ib.c b/net/rds/ib.c index b5342fdda..7eaf887e4 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -40,6 +40,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "ib.h" #include "ib_mr.h" @@ -380,15 +381,15 @@ void rds_ib_exit(void) struct rds_transport rds_ib_transport = { .laddr_check = rds_ib_laddr_check, - .xmit_complete = rds_ib_xmit_complete, + .xmit_path_complete = rds_ib_xmit_path_complete, .xmit = rds_ib_xmit, .xmit_rdma = rds_ib_xmit_rdma, .xmit_atomic = rds_ib_xmit_atomic, - .recv = rds_ib_recv, + .recv_path = rds_ib_recv_path, .conn_alloc = rds_ib_conn_alloc, .conn_free = rds_ib_conn_free, - .conn_connect = rds_ib_conn_connect, - .conn_shutdown = rds_ib_conn_shutdown, + .conn_path_connect = rds_ib_conn_path_connect, + .conn_path_shutdown = rds_ib_conn_path_shutdown, .inc_copy_to_user = rds_ib_inc_copy_to_user, .inc_free = rds_ib_inc_free, .cm_initiate_connect = rds_ib_cm_initiate_connect, diff --git a/net/rds/ib.h b/net/rds/ib.h index 627fb79ae..046f7508c 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -328,8 +328,8 @@ extern struct list_head ib_nodev_conns; /* ib_cm.c */ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); void rds_ib_conn_free(void *arg); -int rds_ib_conn_connect(struct rds_connection *conn); -void rds_ib_conn_shutdown(struct rds_connection *conn); +int rds_ib_conn_path_connect(struct rds_conn_path *cp); +void rds_ib_conn_path_shutdown(struct rds_conn_path *cp); void rds_ib_state_change(struct sock *sk); int rds_ib_listen_init(void); void rds_ib_listen_stop(void); @@ -354,7 +354,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); /* ib_recv.c */ int rds_ib_recv_init(void); void rds_ib_recv_exit(void); -int rds_ib_recv(struct rds_connection *conn); +int rds_ib_recv_path(struct rds_conn_path *conn); int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); void rds_ib_recv_free_caches(struct rds_ib_connection *ic); void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); @@ -384,7 +384,7 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); extern wait_queue_head_t rds_ib_ring_empty_wait; /* ib_send.c */ -void rds_ib_xmit_complete(struct rds_connection *conn); +void rds_ib_xmit_path_complete(struct rds_conn_path *cp); int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 7c2a65a6a..5b2ab95af 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -36,6 +36,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "ib.h" @@ -273,7 +274,7 @@ static void rds_ib_tasklet_fn_send(unsigned long data) if (rds_conn_up(conn) && (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || test_bit(0, &conn->c_map_queued))) - rds_send_xmit(ic->conn); + rds_send_xmit(&ic->conn->c_path[0]); } static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq, @@ -684,8 +685,9 @@ out: return ret; } -int rds_ib_conn_connect(struct rds_connection *conn) +int rds_ib_conn_path_connect(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; struct sockaddr_in src, dest; int ret; @@ -730,8 +732,9 @@ out: * so that it can be called at any point during startup. In fact it * can be called multiple times for a given connection. */ -void rds_ib_conn_shutdown(struct rds_connection *conn) +void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; int err = 0; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index f7164ac1f..977f69886 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -35,6 +35,7 @@ #include #include +#include "rds_single_path.h" #include "ib_mr.h" struct workqueue_struct *rds_ib_mr_wq; @@ -618,7 +619,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int rds_ib_mr_init(void) { - rds_ib_mr_wq = create_workqueue("rds_mr_flushd"); + rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index abc8cc805..606a11f68 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -36,6 +36,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "ib.h" @@ -1008,8 +1009,9 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, rds_ib_recv_refill(conn, 0, GFP_NOWAIT); } -int rds_ib_recv(struct rds_connection *conn) +int rds_ib_recv_path(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; int ret = 0; diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index f27d2c82b..84d90c973 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -36,6 +36,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "ib.h" @@ -979,8 +980,9 @@ out: return ret; } -void rds_ib_xmit_complete(struct rds_connection *conn) +void rds_ib_xmit_path_complete(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; /* We may have a pending ACK or window update we were unable diff --git a/net/rds/loop.c b/net/rds/loop.c index 814173b46..f2bf78de5 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -34,6 +34,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "loop.h" @@ -101,7 +102,7 @@ static void rds_loop_inc_free(struct rds_incoming *inc) } /* we need to at least give the thread something to succeed */ -static int rds_loop_recv(struct rds_connection *conn) +static int rds_loop_recv_path(struct rds_conn_path *cp) { return 0; } @@ -149,13 +150,13 @@ static void rds_loop_conn_free(void *arg) kfree(lc); } -static int rds_loop_conn_connect(struct rds_connection *conn) +static int rds_loop_conn_path_connect(struct rds_conn_path *cp) { - rds_connect_complete(conn); + rds_connect_complete(cp->cp_conn); return 0; } -static void rds_loop_conn_shutdown(struct rds_connection *conn) +static void rds_loop_conn_path_shutdown(struct rds_conn_path *cp) { } @@ -184,11 +185,11 @@ void rds_loop_exit(void) */ struct rds_transport rds_loop_transport = { .xmit = rds_loop_xmit, - .recv = rds_loop_recv, + .recv_path = rds_loop_recv_path, .conn_alloc = rds_loop_conn_alloc, .conn_free = rds_loop_conn_free, - .conn_connect = rds_loop_conn_connect, - .conn_shutdown = rds_loop_conn_shutdown, + .conn_path_connect = rds_loop_conn_path_connect, + .conn_path_shutdown = rds_loop_conn_path_shutdown, .inc_copy_to_user = rds_message_inc_copy_to_user, .inc_free = rds_loop_inc_free, .t_name = "loopback", diff --git a/net/rds/message.c b/net/rds/message.c index 756c73729..6cb910615 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -41,6 +41,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +[RDS_EXTHDR_NPATHS] = sizeof(u16), }; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 7220bebcf..345f09059 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -33,6 +33,7 @@ #include #include +#include "rds_single_path.h" #include "rdma_transport.h" #include "ib.h" diff --git a/net/rds/rds.h b/net/rds/rds.h index 387df5f32..b2d17f0fa 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -84,56 +84,73 @@ enum { #define RDS_IN_XMIT 2 #define RDS_RECV_REFILL 3 +/* Max number of multipaths per RDS connection. Must be a power of 2 */ +#define RDS_MPATH_WORKERS 8 +#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ + (rs)->rs_hash_initval) & ((n) - 1)) + +/* Per mpath connection state */ +struct rds_conn_path { + struct rds_connection *cp_conn; + struct rds_message *cp_xmit_rm; + unsigned long cp_xmit_sg; + unsigned int cp_xmit_hdr_off; + unsigned int cp_xmit_data_off; + unsigned int cp_xmit_atomic_sent; + unsigned int cp_xmit_rdma_sent; + unsigned int cp_xmit_data_sent; + + spinlock_t cp_lock; /* protect msg queues */ + u64 cp_next_tx_seq; + struct list_head cp_send_queue; + struct list_head cp_retrans; + + u64 cp_next_rx_seq; + + void *cp_transport_data; + + atomic_t cp_state; + unsigned long cp_send_gen; + unsigned long cp_flags; + unsigned long cp_reconnect_jiffies; + struct delayed_work cp_send_w; + struct delayed_work cp_recv_w; + struct delayed_work cp_conn_w; + struct work_struct cp_down_w; + struct mutex cp_cm_lock; /* protect cp_state & cm */ + wait_queue_head_t cp_waitq; + + unsigned int cp_unacked_packets; + unsigned int cp_unacked_bytes; + unsigned int cp_outgoing:1, + cp_pad_to_32:31; + unsigned int cp_index; +}; + +/* One rds_connection per RDS address pair */ struct rds_connection { struct hlist_node c_hash_node; __be32 c_laddr; __be32 c_faddr; unsigned int c_loopback:1, - c_outgoing:1, + c_ping_triggered:1, c_pad_to_32:30; + int c_npaths; struct rds_connection *c_passive; + struct rds_transport *c_trans; struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; - struct rds_message *c_xmit_rm; - unsigned long c_xmit_sg; - unsigned int c_xmit_hdr_off; - unsigned int c_xmit_data_off; - unsigned int c_xmit_atomic_sent; - unsigned int c_xmit_rdma_sent; - unsigned int c_xmit_data_sent; - - spinlock_t c_lock; /* protect msg queues */ - u64 c_next_tx_seq; - struct list_head c_send_queue; - struct list_head c_retrans; - - u64 c_next_rx_seq; - - struct rds_transport *c_trans; - void *c_transport_data; - - atomic_t c_state; - unsigned long c_send_gen; - unsigned long c_flags; - unsigned long c_reconnect_jiffies; - struct delayed_work c_send_w; - struct delayed_work c_recv_w; - struct delayed_work c_conn_w; - struct work_struct c_down_w; - struct mutex c_cm_lock; /* protect conn state & cm */ - wait_queue_head_t c_waitq; + /* Protocol version */ + unsigned int c_version; + possible_net_t c_net; struct list_head c_map_item; unsigned long c_map_queued; - unsigned int c_unacked_packets; - unsigned int c_unacked_bytes; - - /* Protocol version */ - unsigned int c_version; - possible_net_t c_net; + struct rds_conn_path c_path[RDS_MPATH_WORKERS]; + wait_queue_head_t c_hs_waitq; /* handshake waitq */ }; static inline @@ -153,6 +170,17 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net) #define RDS_FLAG_RETRANSMITTED 0x04 #define RDS_MAX_ADV_CREDIT 255 +/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping + * probe to exchange control information before establishing a connection. + * Currently the control information that is exchanged is the number of + * supported paths. If the peer is a legacy (older kernel revision) peer, + * it would return a pong message without additional control information + * that would then alert the sender that the peer was an older rev. + */ +#define RDS_FLAG_PROBE_PORT 1 +#define RDS_HS_PROBE(sport, dport) \ + ((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \ + (sport == 0 && dport == RDS_FLAG_PROBE_PORT)) /* * Maximum space available for extension headers. */ @@ -212,12 +240,18 @@ struct rds_ext_header_rdma_dest { __be32 h_rdma_offset; }; +/* Extension header announcing number of paths. + * Implicit length = 2 bytes. + */ +#define RDS_EXTHDR_NPATHS 4 + #define __RDS_EXTHDR_MAX 16 /* for now */ struct rds_incoming { atomic_t i_refcount; struct list_head i_item; struct rds_connection *i_conn; + struct rds_conn_path *i_conn_path; struct rds_header i_hdr; unsigned long i_rx_jiffies; __be32 i_saddr; @@ -433,21 +467,22 @@ struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; struct module *t_owner; - unsigned int t_prefer_loopback:1; + unsigned int t_prefer_loopback:1, + t_mp_capable:1; unsigned int t_type; int (*laddr_check)(struct net *net, __be32 addr); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); - int (*conn_connect)(struct rds_connection *conn); - void (*conn_shutdown)(struct rds_connection *conn); - void (*xmit_prepare)(struct rds_connection *conn); - void (*xmit_complete)(struct rds_connection *conn); + int (*conn_path_connect)(struct rds_conn_path *cp); + void (*conn_path_shutdown)(struct rds_conn_path *conn); + void (*xmit_path_prepare)(struct rds_conn_path *cp); + void (*xmit_path_complete)(struct rds_conn_path *cp); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); - int (*recv)(struct rds_connection *conn); + int (*recv_path)(struct rds_conn_path *cp); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to); void (*inc_free)(struct rds_incoming *inc); @@ -530,6 +565,7 @@ struct rds_sock { /* Socket options - in case there will be more */ unsigned char rs_recverr, rs_cong_monitor; + u32 rs_hash_initval; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) @@ -636,10 +672,12 @@ struct rds_connection *rds_conn_create(struct net *net, struct rds_connection *rds_conn_create_outgoing(struct net *net, __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); -void rds_conn_shutdown(struct rds_connection *conn); +void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); +void rds_conn_path_drop(struct rds_conn_path *cpath); void rds_conn_connect_if_down(struct rds_connection *conn); +void rds_conn_path_connect_if_down(struct rds_conn_path *cp); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, @@ -650,28 +688,60 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...); #define rds_conn_error(conn, fmt...) \ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) +void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); +#define rds_conn_path_error(cp, fmt...) \ + __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) + +static inline int +rds_conn_path_transition(struct rds_conn_path *cp, int old, int new) +{ + return atomic_cmpxchg(&cp->cp_state, old, new) == old; +} + static inline int rds_conn_transition(struct rds_connection *conn, int old, int new) { - return atomic_cmpxchg(&conn->c_state, old, new) == old; + WARN_ON(conn->c_trans->t_mp_capable); + return rds_conn_path_transition(&conn->c_path[0], old, new); +} + +static inline int +rds_conn_path_state(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state); } static inline int rds_conn_state(struct rds_connection *conn) { - return atomic_read(&conn->c_state); + WARN_ON(conn->c_trans->t_mp_capable); + return rds_conn_path_state(&conn->c_path[0]); +} + +static inline int +rds_conn_path_up(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state) == RDS_CONN_UP; } static inline int rds_conn_up(struct rds_connection *conn) { - return atomic_read(&conn->c_state) == RDS_CONN_UP; + WARN_ON(conn->c_trans->t_mp_capable); + return rds_conn_path_up(&conn->c_path[0]); +} + +static inline int +rds_conn_path_connecting(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING; } static inline int rds_conn_connecting(struct rds_connection *conn) { - return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING; + WARN_ON(conn->c_trans->t_mp_capable); + return rds_conn_path_connecting(&conn->c_path[0]); } /* message.c */ @@ -720,6 +790,8 @@ void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr); +void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, + __be32 saddr); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, struct rds_incoming *inc, gfp_t gfp); @@ -733,16 +805,16 @@ void rds_inc_info_copy(struct rds_incoming *inc, /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); -void rds_send_reset(struct rds_connection *conn); -int rds_send_xmit(struct rds_connection *conn); +void rds_send_path_reset(struct rds_conn_path *conn); +int rds_send_xmit(struct rds_conn_path *cp); struct sockaddr_in; void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); -int rds_send_pong(struct rds_connection *conn, __be16 dport); -struct rds_message *rds_send_get_message(struct rds_connection *, - struct rm_rdma_op *); +void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, + is_acked_func is_acked); +int rds_send_pong(struct rds_conn_path *cp, __be16 dport); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); @@ -809,12 +881,12 @@ extern unsigned int rds_sysctl_trace_level; int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; -void rds_queue_reconnect(struct rds_connection *conn); +void rds_queue_reconnect(struct rds_conn_path *cp); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); -void rds_connect_path_complete(struct rds_connection *conn, int curr); +void rds_connect_path_complete(struct rds_conn_path *conn, int curr); void rds_connect_complete(struct rds_connection *conn); /* transport.c */ diff --git a/net/rds/rds_single_path.h b/net/rds/rds_single_path.h new file mode 100644 index 000000000..e1241af7c --- /dev/null +++ b/net/rds/rds_single_path.h @@ -0,0 +1,30 @@ +#ifndef _RDS_RDS_SINGLE_H +#define _RDS_RDS_SINGLE_H + +#define c_xmit_rm c_path[0].cp_xmit_rm +#define c_xmit_sg c_path[0].cp_xmit_sg +#define c_xmit_hdr_off c_path[0].cp_xmit_hdr_off +#define c_xmit_data_off c_path[0].cp_xmit_data_off +#define c_xmit_atomic_sent c_path[0].cp_xmit_atomic_sent +#define c_xmit_rdma_sent c_path[0].cp_xmit_rdma_sent +#define c_xmit_data_sent c_path[0].cp_xmit_data_sent +#define c_lock c_path[0].cp_lock +#define c_next_tx_seq c_path[0].cp_next_tx_seq +#define c_send_queue c_path[0].cp_send_queue +#define c_retrans c_path[0].cp_retrans +#define c_next_rx_seq c_path[0].cp_next_rx_seq +#define c_transport_data c_path[0].cp_transport_data +#define c_state c_path[0].cp_state +#define c_send_gen c_path[0].cp_send_gen +#define c_flags c_path[0].cp_flags +#define c_reconnect_jiffies c_path[0].cp_reconnect_jiffies +#define c_send_w c_path[0].cp_send_w +#define c_recv_w c_path[0].cp_recv_w +#define c_conn_w c_path[0].cp_conn_w +#define c_down_w c_path[0].cp_down_w +#define c_cm_lock c_path[0].cp_cm_lock +#define c_waitq c_path[0].cp_waitq +#define c_unacked_packets c_path[0].cp_unacked_packets +#define c_unacked_bytes c_path[0].cp_unacked_bytes + +#endif /* _RDS_RDS_SINGLE_H */ diff --git a/net/rds/recv.c b/net/rds/recv.c index 8413f6c99..cbfabdf3f 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -53,6 +53,20 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, } EXPORT_SYMBOL_GPL(rds_inc_init); +void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, + __be32 saddr) +{ + atomic_set(&inc->i_refcount, 1); + INIT_LIST_HEAD(&inc->i_item); + inc->i_conn = cp->cp_conn; + inc->i_conn_path = cp; + inc->i_saddr = saddr; + inc->i_rdma_cookie = 0; + inc->i_rx_tstamp.tv_sec = 0; + inc->i_rx_tstamp.tv_usec = 0; +} +EXPORT_SYMBOL_GPL(rds_inc_path_init); + static void rds_inc_addref(struct rds_incoming *inc) { rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); @@ -142,6 +156,67 @@ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock } } +static void rds_recv_hs_exthdrs(struct rds_header *hdr, + struct rds_connection *conn) +{ + unsigned int pos = 0, type, len; + union { + struct rds_ext_header_version version; + u16 rds_npaths; + } buffer; + + while (1) { + len = sizeof(buffer); + type = rds_message_next_extension(hdr, &pos, &buffer, &len); + if (type == RDS_EXTHDR_NONE) + break; + /* Process extension header here */ + switch (type) { + case RDS_EXTHDR_NPATHS: + conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, + buffer.rds_npaths); + break; + default: + pr_warn_ratelimited("ignoring unknown exthdr type " + "0x%x\n", type); + } + } + /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ + conn->c_npaths = max_t(int, conn->c_npaths, 1); +} + +/* rds_start_mprds() will synchronously start multiple paths when appropriate. + * The scheme is based on the following rules: + * + * 1. rds_sendmsg on first connect attempt sends the probe ping, with the + * sender's npaths (s_npaths) + * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It + * sends back a probe-pong with r_npaths. After that, if rcvr is the + * smaller ip addr, it starts rds_conn_path_connect_if_down on all + * mprds_paths. + * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down. + * If it is the smaller ipaddr, rds_conn_path_connect_if_down can be + * called after reception of the probe-pong on all mprds_paths. + * Otherwise (sender of probe-ping is not the smaller ip addr): just call + * rds_conn_path_connect_if_down on the hashed path. (see rule 4) + * 4. when cp_index > 0, rds_connect_worker must only trigger + * a connection if laddr < faddr. + * 5. sender may end up queuing the packet on the cp. will get sent out later. + * when connection is completed. + */ +static void rds_start_mprds(struct rds_connection *conn) +{ + int i; + struct rds_conn_path *cp; + + if (conn->c_npaths > 1 && conn->c_laddr < conn->c_faddr) { + for (i = 1; i < conn->c_npaths; i++) { + cp = &conn->c_path[i]; + rds_conn_path_connect_if_down(cp); + } + } +} + /* * The transport must make sure that this is serialized against other * rx and conn reset on this specific conn. @@ -164,13 +239,18 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, struct rds_sock *rs = NULL; struct sock *sk; unsigned long flags; + struct rds_conn_path *cp; inc->i_conn = conn; inc->i_rx_jiffies = jiffies; + if (conn->c_trans->t_mp_capable) + cp = inc->i_conn_path; + else + cp = &conn->c_path[0]; rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " "flags 0x%x rx_jiffies %lu\n", conn, - (unsigned long long)conn->c_next_rx_seq, + (unsigned long long)cp->cp_next_rx_seq, inc, (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), be32_to_cpu(inc->i_hdr.h_len), @@ -199,16 +279,34 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, * XXX we could spend more on the wire to get more robust failure * detection, arguably worth it to avoid data corruption. */ - if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && + if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { rds_stats_inc(s_recv_drop_old_seq); goto out; } - conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; + cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { + if (inc->i_hdr.h_sport == 0) { + rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); + goto out; + } rds_stats_inc(s_recv_ping); - rds_send_pong(conn, inc->i_hdr.h_sport); + rds_send_pong(cp, inc->i_hdr.h_sport); + /* if this is a handshake ping, start multipath if necessary */ + if (RDS_HS_PROBE(inc->i_hdr.h_sport, inc->i_hdr.h_dport)) { + rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); + rds_start_mprds(cp->cp_conn); + } + goto out; + } + + if (inc->i_hdr.h_dport == RDS_FLAG_PROBE_PORT && + inc->i_hdr.h_sport == 0) { + rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); + /* if this is a handshake pong, start multipath if necessary */ + rds_start_mprds(cp->cp_conn); + wake_up(&cp->cp_conn->c_hs_waitq); goto out; } diff --git a/net/rds/send.c b/net/rds/send.c index b1962f8e3..896626b9a 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -62,14 +62,14 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status); * Reset the send state. Callers must ensure that this doesn't race with * rds_send_xmit(). */ -void rds_send_reset(struct rds_connection *conn) +void rds_send_path_reset(struct rds_conn_path *cp) { struct rds_message *rm, *tmp; unsigned long flags; - if (conn->c_xmit_rm) { - rm = conn->c_xmit_rm; - conn->c_xmit_rm = NULL; + if (cp->cp_xmit_rm) { + rm = cp->cp_xmit_rm; + cp->cp_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. This isn't entirely true (it's flushed out * independently) but as the connection is down, there's @@ -78,37 +78,37 @@ void rds_send_reset(struct rds_connection *conn) rds_message_put(rm); } - conn->c_xmit_sg = 0; - conn->c_xmit_hdr_off = 0; - conn->c_xmit_data_off = 0; - conn->c_xmit_atomic_sent = 0; - conn->c_xmit_rdma_sent = 0; - conn->c_xmit_data_sent = 0; + cp->cp_xmit_sg = 0; + cp->cp_xmit_hdr_off = 0; + cp->cp_xmit_data_off = 0; + cp->cp_xmit_atomic_sent = 0; + cp->cp_xmit_rdma_sent = 0; + cp->cp_xmit_data_sent = 0; - conn->c_map_queued = 0; + cp->cp_conn->c_map_queued = 0; - conn->c_unacked_packets = rds_sysctl_max_unacked_packets; - conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; + cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; /* Mark messages as retransmissions, and move them to the send q */ - spin_lock_irqsave(&conn->c_lock, flags); - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + spin_lock_irqsave(&cp->cp_lock, flags); + list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); } - list_splice_init(&conn->c_retrans, &conn->c_send_queue); - spin_unlock_irqrestore(&conn->c_lock, flags); + list_splice_init(&cp->cp_retrans, &cp->cp_send_queue); + spin_unlock_irqrestore(&cp->cp_lock, flags); } -EXPORT_SYMBOL_GPL(rds_send_reset); +EXPORT_SYMBOL_GPL(rds_send_path_reset); -static int acquire_in_xmit(struct rds_connection *conn) +static int acquire_in_xmit(struct rds_conn_path *cp) { - return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; + return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0; } -static void release_in_xmit(struct rds_connection *conn) +static void release_in_xmit(struct rds_conn_path *cp) { - clear_bit(RDS_IN_XMIT, &conn->c_flags); + clear_bit(RDS_IN_XMIT, &cp->cp_flags); smp_mb__after_atomic(); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a @@ -116,8 +116,8 @@ static void release_in_xmit(struct rds_connection *conn) * the system-wide hashed waitqueue buckets in the fast path only to * almost never find waiters. */ - if (waitqueue_active(&conn->c_waitq)) - wake_up_all(&conn->c_waitq); + if (waitqueue_active(&cp->cp_waitq)) + wake_up_all(&cp->cp_waitq); } /* @@ -134,8 +134,9 @@ static void release_in_xmit(struct rds_connection *conn) * - small message latency is higher behind queued large messages * - large message latency isn't starved by intervening small sends */ -int rds_send_xmit(struct rds_connection *conn) +int rds_send_xmit(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_message *rm; unsigned long flags; unsigned int tmp; @@ -155,7 +156,7 @@ restart: * avoids blocking the caller and trading per-connection data between * caches per message. */ - if (!acquire_in_xmit(conn)) { + if (!acquire_in_xmit(cp)) { rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; @@ -169,21 +170,21 @@ restart: * The acquire_in_xmit() check above ensures that only one * caller can increment c_send_gen at any time. */ - conn->c_send_gen++; - send_gen = conn->c_send_gen; + cp->cp_send_gen++; + send_gen = cp->cp_send_gen; /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, * we do the opposite to avoid races. */ - if (!rds_conn_up(conn)) { - release_in_xmit(conn); + if (!rds_conn_path_up(cp)) { + release_in_xmit(cp); ret = 0; goto out; } - if (conn->c_trans->xmit_prepare) - conn->c_trans->xmit_prepare(conn); + if (conn->c_trans->xmit_path_prepare) + conn->c_trans->xmit_path_prepare(cp); /* * spin trying to push headers and data down the connection until @@ -191,7 +192,7 @@ restart: */ while (1) { - rm = conn->c_xmit_rm; + rm = cp->cp_xmit_rm; /* * If between sending messages, we can send a pending congestion @@ -204,14 +205,16 @@ restart: break; } rm->data.op_active = 1; + rm->m_inc.i_conn_path = cp; + rm->m_inc.i_conn = cp->cp_conn; - conn->c_xmit_rm = rm; + cp->cp_xmit_rm = rm; } /* * If not already working on one, grab the next message. * - * c_xmit_rm holds a ref while we're sending this message down + * cp_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. */ @@ -228,10 +231,10 @@ restart: if (batch_count >= send_batch_count) goto over_batch; - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock_irqsave(&cp->cp_lock, flags); - if (!list_empty(&conn->c_send_queue)) { - rm = list_entry(conn->c_send_queue.next, + if (!list_empty(&cp->cp_send_queue)) { + rm = list_entry(cp->cp_send_queue.next, struct rds_message, m_conn_item); rds_message_addref(rm); @@ -240,10 +243,11 @@ restart: * Move the message from the send queue to the retransmit * list right away. */ - list_move_tail(&rm->m_conn_item, &conn->c_retrans); + list_move_tail(&rm->m_conn_item, + &cp->cp_retrans); } - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); if (!rm) break; @@ -257,32 +261,34 @@ restart: */ if (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock_irqsave(&cp->cp_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); continue; } /* Require an ACK every once in a while */ len = ntohl(rm->m_inc.i_hdr.h_len); - if (conn->c_unacked_packets == 0 || - conn->c_unacked_bytes < len) { + if (cp->cp_unacked_packets == 0 || + cp->cp_unacked_bytes < len) { __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); - conn->c_unacked_packets = rds_sysctl_max_unacked_packets; - conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + cp->cp_unacked_packets = + rds_sysctl_max_unacked_packets; + cp->cp_unacked_bytes = + rds_sysctl_max_unacked_bytes; rds_stats_inc(s_send_ack_required); } else { - conn->c_unacked_bytes -= len; - conn->c_unacked_packets--; + cp->cp_unacked_bytes -= len; + cp->cp_unacked_packets--; } - conn->c_xmit_rm = rm; + cp->cp_xmit_rm = rm; } /* The transport either sends the whole rdma or none of it */ - if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue @@ -294,11 +300,11 @@ restart: wake_up_interruptible(&rm->m_flush_wait); break; } - conn->c_xmit_rdma_sent = 1; + cp->cp_xmit_rdma_sent = 1; } - if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue @@ -310,7 +316,7 @@ restart: wake_up_interruptible(&rm->m_flush_wait); break; } - conn->c_xmit_atomic_sent = 1; + cp->cp_xmit_atomic_sent = 1; } @@ -336,41 +342,42 @@ restart: rm->data.op_active = 0; } - if (rm->data.op_active && !conn->c_xmit_data_sent) { + if (rm->data.op_active && !cp->cp_xmit_data_sent) { rm->m_final_op = &rm->data; + ret = conn->c_trans->xmit(conn, rm, - conn->c_xmit_hdr_off, - conn->c_xmit_sg, - conn->c_xmit_data_off); + cp->cp_xmit_hdr_off, + cp->cp_xmit_sg, + cp->cp_xmit_data_off); if (ret <= 0) break; - if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) { + if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) { tmp = min_t(int, ret, sizeof(struct rds_header) - - conn->c_xmit_hdr_off); - conn->c_xmit_hdr_off += tmp; + cp->cp_xmit_hdr_off); + cp->cp_xmit_hdr_off += tmp; ret -= tmp; } - sg = &rm->data.op_sg[conn->c_xmit_sg]; + sg = &rm->data.op_sg[cp->cp_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - - conn->c_xmit_data_off); - conn->c_xmit_data_off += tmp; + cp->cp_xmit_data_off); + cp->cp_xmit_data_off += tmp; ret -= tmp; - if (conn->c_xmit_data_off == sg->length) { - conn->c_xmit_data_off = 0; + if (cp->cp_xmit_data_off == sg->length) { + cp->cp_xmit_data_off = 0; sg++; - conn->c_xmit_sg++; - BUG_ON(ret != 0 && - conn->c_xmit_sg == rm->data.op_nents); + cp->cp_xmit_sg++; + BUG_ON(ret != 0 && cp->cp_xmit_sg == + rm->data.op_nents); } } - if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && - (conn->c_xmit_sg == rm->data.op_nents)) - conn->c_xmit_data_sent = 1; + if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) && + (cp->cp_xmit_sg == rm->data.op_nents)) + cp->cp_xmit_data_sent = 1; } /* @@ -378,23 +385,23 @@ restart: * if there is a data op. Thus, if the data is sent (or there was * none), then we're done with the rm. */ - if (!rm->data.op_active || conn->c_xmit_data_sent) { - conn->c_xmit_rm = NULL; - conn->c_xmit_sg = 0; - conn->c_xmit_hdr_off = 0; - conn->c_xmit_data_off = 0; - conn->c_xmit_rdma_sent = 0; - conn->c_xmit_atomic_sent = 0; - conn->c_xmit_data_sent = 0; + if (!rm->data.op_active || cp->cp_xmit_data_sent) { + cp->cp_xmit_rm = NULL; + cp->cp_xmit_sg = 0; + cp->cp_xmit_hdr_off = 0; + cp->cp_xmit_data_off = 0; + cp->cp_xmit_rdma_sent = 0; + cp->cp_xmit_atomic_sent = 0; + cp->cp_xmit_data_sent = 0; rds_message_put(rm); } } over_batch: - if (conn->c_trans->xmit_complete) - conn->c_trans->xmit_complete(conn); - release_in_xmit(conn); + if (conn->c_trans->xmit_path_complete) + conn->c_trans->xmit_path_complete(cp); + release_in_xmit(cp); /* Nuke any messages we decided not to retransmit. */ if (!list_empty(&to_be_dropped)) { @@ -422,12 +429,12 @@ over_batch: if (ret == 0) { smp_mb(); if ((test_bit(0, &conn->c_map_queued) || - !list_empty(&conn->c_send_queue)) && - send_gen == conn->c_send_gen) { + !list_empty(&cp->cp_send_queue)) && + send_gen == cp->cp_send_gen) { rds_stats_inc(s_send_lock_queue_raced); if (batch_count < send_batch_count) goto restart; - queue_delayed_work(rds_wq, &conn->c_send_w, 1); + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); } } out: @@ -559,42 +566,6 @@ __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) /* No need to wake the app - caller does this */ } -/* - * This is called from the IB send completion when we detect - * a RDMA operation that failed with remote access error. - * So speed is not an issue here. - */ -struct rds_message *rds_send_get_message(struct rds_connection *conn, - struct rm_rdma_op *op) -{ - struct rds_message *rm, *tmp, *found = NULL; - unsigned long flags; - - spin_lock_irqsave(&conn->c_lock, flags); - - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (&rm->rdma == op) { - atomic_inc(&rm->m_refcount); - found = rm; - goto out; - } - } - - list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (&rm->rdma == op) { - atomic_inc(&rm->m_refcount); - found = rm; - break; - } - } - -out: - spin_unlock_irqrestore(&conn->c_lock, flags); - - return found; -} -EXPORT_SYMBOL_GPL(rds_send_get_message); - /* * This removes messages from the socket's list if they're on it. The list * argument must be private to the caller, we must be able to modify it @@ -685,16 +656,16 @@ unlock_and_drop: * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked * checks the RDS_MSG_HAS_ACK_SEQ bit. */ -void rds_send_drop_acked(struct rds_connection *conn, u64 ack, - is_acked_func is_acked) +void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, + is_acked_func is_acked) { struct rds_message *rm, *tmp; unsigned long flags; LIST_HEAD(list); - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock_irqsave(&cp->cp_lock, flags); - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { if (!rds_send_is_acked(rm, ack, is_acked)) break; @@ -706,17 +677,26 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, if (!list_empty(&list)) smp_mb__after_atomic(); - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); /* now remove the messages from the sock list as needed */ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); } +EXPORT_SYMBOL_GPL(rds_send_path_drop_acked); + +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked) +{ + WARN_ON(conn->c_trans->t_mp_capable); + rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked); +} EXPORT_SYMBOL_GPL(rds_send_drop_acked); void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; + struct rds_conn_path *cp; unsigned long flags; LIST_HEAD(list); @@ -745,22 +725,26 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) list_for_each_entry(rm, &list, m_sock_item) { conn = rm->m_inc.i_conn; + if (conn->c_trans->t_mp_capable) + cp = rm->m_inc.i_conn_path; + else + cp = &conn->c_path[0]; - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock_irqsave(&cp->cp_lock, flags); /* * Maybe someone else beat us to removing rm from the conn. * If we race with their flag update we'll get the lock and * then really see that the flag has been cleared. */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); spin_lock_irqsave(&rm->m_rs_lock, flags); rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); continue; } list_del_init(&rm->m_conn_item); - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); /* * Couldn't grab m_rs_lock in top loop (lock ordering), @@ -809,6 +793,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) * message from the flow with RDS_CANCEL_SENT_TO. */ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, + struct rds_conn_path *cp, struct rds_message *rm, __be16 sport, __be16 dport, int *queued) { @@ -852,13 +837,14 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, trying to minimize the time we hold c_lock */ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); rm->m_inc.i_conn = conn; + rm->m_inc.i_conn_path = cp; rds_message_addref(rm); - spin_lock(&conn->c_lock); - rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++); - list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + spin_lock(&cp->cp_lock); + rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++); + list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); - spin_unlock(&conn->c_lock); + spin_unlock(&cp->cp_lock); rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", rm, len, rs, rs->rs_snd_bytes, @@ -977,6 +963,29 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, return ret; } +static void rds_send_ping(struct rds_connection *conn); + +static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) +{ + int hash; + + if (conn->c_npaths == 0) + hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS); + else + hash = RDS_MPATH_HASH(rs, conn->c_npaths); + if (conn->c_npaths == 0 && hash != 0) { + rds_send_ping(conn); + + if (conn->c_npaths == 0) { + wait_event_interruptible(conn->c_hs_waitq, + (conn->c_npaths != 0)); + } + if (conn->c_npaths == 1) + hash = 0; + } + return hash; +} + int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; @@ -990,6 +999,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) int queued = 0, allocated_mr = 0; int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); + struct rds_conn_path *cpath; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1088,15 +1098,19 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } - rds_conn_connect_if_down(conn); + if (conn->c_trans->t_mp_capable) + cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)]; + else + cpath = &conn->c_path[0]; + + rds_conn_path_connect_if_down(cpath); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); if (ret) { rs->rs_seen_congestion = 1; goto out; } - - while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, + while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); @@ -1106,7 +1120,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) } timeo = wait_event_interruptible_timeout(*sk_sleep(sk), - rds_send_queue_rm(rs, conn, rm, + rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued), @@ -1127,9 +1141,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) */ rds_stats_inc(s_send_queued); - ret = rds_send_xmit(conn); + ret = rds_send_xmit(cpath); if (ret == -ENOMEM || ret == -EAGAIN) - queue_delayed_work(rds_wq, &conn->c_send_w, 1); + queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); rds_message_put(rm); return payload_len; @@ -1147,10 +1161,16 @@ out: } /* - * Reply to a ping packet. + * send out a probe. Can be shared by rds_send_ping, + * rds_send_pong, rds_send_hb. + * rds_send_hb should use h_flags + * RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED + * or + * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED */ int -rds_send_pong(struct rds_connection *conn, __be16 dport) +rds_send_probe(struct rds_conn_path *cp, __be16 sport, + __be16 dport, u8 h_flags) { struct rds_message *rm; unsigned long flags; @@ -1162,31 +1182,41 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) goto out; } - rm->m_daddr = conn->c_faddr; + rm->m_daddr = cp->cp_conn->c_faddr; rm->data.op_active = 1; - rds_conn_connect_if_down(conn); + rds_conn_path_connect_if_down(cp); - ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); + ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL); if (ret) goto out; - spin_lock_irqsave(&conn->c_lock, flags); - list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + spin_lock_irqsave(&cp->cp_lock, flags); + list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); rds_message_addref(rm); - rm->m_inc.i_conn = conn; + rm->m_inc.i_conn = cp->cp_conn; + rm->m_inc.i_conn_path = cp; + + rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, + cp->cp_next_tx_seq); + rm->m_inc.i_hdr.h_flags |= h_flags; + cp->cp_next_tx_seq++; + + if (RDS_HS_PROBE(sport, dport) && cp->cp_conn->c_trans->t_mp_capable) { + u16 npaths = RDS_MPATH_WORKERS; - rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport, - conn->c_next_tx_seq); - conn->c_next_tx_seq++; - spin_unlock_irqrestore(&conn->c_lock, flags); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_NPATHS, &npaths, + sizeof(npaths)); + } + spin_unlock_irqrestore(&cp->cp_lock, flags); rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); /* schedule the send work on rds_wq */ - queue_delayed_work(rds_wq, &conn->c_send_w, 1); + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); rds_message_put(rm); return 0; @@ -1196,3 +1226,25 @@ out: rds_message_put(rm); return ret; } + +int +rds_send_pong(struct rds_conn_path *cp, __be16 dport) +{ + return rds_send_probe(cp, 0, dport, 0); +} + +void +rds_send_ping(struct rds_connection *conn) +{ + unsigned long flags; + struct rds_conn_path *cp = &conn->c_path[0]; + + spin_lock_irqsave(&cp->cp_lock, flags); + if (conn->c_ping_triggered) { + spin_unlock_irqrestore(&cp->cp_lock, flags); + return; + } + conn->c_ping_triggered = 1; + spin_unlock_irqrestore(&cp->cp_lock, flags); + rds_send_probe(&conn->c_path[0], RDS_FLAG_PROBE_PORT, 0, 0); +} diff --git a/net/rds/tcp.c b/net/rds/tcp.c index c8a7b4c90..fcddacc92 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -56,8 +56,8 @@ static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *fpos); -int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; -int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; +static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; +static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; static struct ctl_table rds_tcp_sysctl_table[] = { #define RDS_TCP_SNDBUF 0 @@ -135,9 +135,9 @@ void rds_tcp_restore_callbacks(struct socket *sock, * from being called while it isn't set. */ void rds_tcp_reset_callbacks(struct socket *sock, - struct rds_connection *conn) + struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *osock = tc->t_sock; if (!osock) @@ -147,8 +147,8 @@ void rds_tcp_reset_callbacks(struct socket *sock, * We have an outstanding SYN to this peer, which may * potentially have transitioned to the RDS_CONN_UP state, * so we must quiesce any send threads before resetting - * c_transport_data. We quiesce these threads by setting - * c_state to something other than RDS_CONN_UP, and then + * cp_transport_data. We quiesce these threads by setting + * cp_state to something other than RDS_CONN_UP, and then * waiting for any existing threads in rds_send_xmit to * complete release_in_xmit(). (Subsequent threads entering * rds_send_xmit() will bail on !rds_conn_up(). @@ -163,38 +163,25 @@ void rds_tcp_reset_callbacks(struct socket *sock, * RDS_CONN_RESETTTING, to ensure that rds_tcp_state_change * cannot mark rds_conn_path_up() in the window before lock_sock() */ - atomic_set(&conn->c_state, RDS_CONN_RESETTING); - wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags)); + atomic_set(&cp->cp_state, RDS_CONN_RESETTING); + wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); lock_sock(osock->sk); /* reset receive side state for rds_tcp_data_recv() for osock */ + cancel_delayed_work_sync(&cp->cp_send_w); + cancel_delayed_work_sync(&cp->cp_recv_w); if (tc->t_tinc) { rds_inc_put(&tc->t_tinc->ti_inc); tc->t_tinc = NULL; } tc->t_tinc_hdr_rem = sizeof(struct rds_header); tc->t_tinc_data_rem = 0; - tc->t_sock = NULL; - - write_lock_bh(&osock->sk->sk_callback_lock); - - osock->sk->sk_user_data = NULL; - osock->sk->sk_data_ready = tc->t_orig_data_ready; - osock->sk->sk_write_space = tc->t_orig_write_space; - osock->sk->sk_state_change = tc->t_orig_state_change; - write_unlock_bh(&osock->sk->sk_callback_lock); + rds_tcp_restore_callbacks(osock, tc); release_sock(osock->sk); sock_release(osock); newsock: - rds_send_reset(conn); + rds_send_path_reset(cp); lock_sock(sock->sk); - write_lock_bh(&sock->sk->sk_callback_lock); - tc->t_sock = sock; - sock->sk->sk_user_data = conn; - sock->sk->sk_data_ready = rds_tcp_data_ready; - sock->sk->sk_write_space = rds_tcp_write_space; - sock->sk->sk_state_change = rds_tcp_state_change; - - write_unlock_bh(&sock->sk->sk_callback_lock); + rds_tcp_set_callbacks(sock, cp); release_sock(sock->sk); } @@ -202,9 +189,9 @@ newsock: * above rds_tcp_reset_callbacks for notes about synchronization * with data path */ -void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) +void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); write_lock_bh(&sock->sk->sk_callback_lock); @@ -220,12 +207,12 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) sock->sk->sk_data_ready = sock->sk->sk_user_data; tc->t_sock = sock; - tc->conn = conn; + tc->t_cpath = cp; tc->t_orig_data_ready = sock->sk->sk_data_ready; tc->t_orig_write_space = sock->sk->sk_write_space; tc->t_orig_state_change = sock->sk->sk_state_change; - sock->sk->sk_user_data = conn; + sock->sk->sk_user_data = cp; sock->sk->sk_data_ready = rds_tcp_data_ready; sock->sk->sk_write_space = rds_tcp_write_space; sock->sk->sk_state_change = rds_tcp_state_change; @@ -283,24 +270,29 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr) static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) { struct rds_tcp_connection *tc; + int i; - tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); - if (!tc) - return -ENOMEM; + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); + if (!tc) + return -ENOMEM; - mutex_init(&tc->t_conn_lock); - tc->t_sock = NULL; - tc->t_tinc = NULL; - tc->t_tinc_hdr_rem = sizeof(struct rds_header); - tc->t_tinc_data_rem = 0; + mutex_init(&tc->t_conn_path_lock); + tc->t_sock = NULL; + tc->t_tinc = NULL; + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; - conn->c_transport_data = tc; + conn->c_path[i].cp_transport_data = tc; + tc->t_cpath = &conn->c_path[i]; - spin_lock_irq(&rds_tcp_conn_lock); - list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); - spin_unlock_irq(&rds_tcp_conn_lock); + spin_lock_irq(&rds_tcp_conn_lock); + list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); + spin_unlock_irq(&rds_tcp_conn_lock); + rdsdebug("rds_conn_path [%d] tc %p\n", i, + conn->c_path[i].cp_transport_data); + } - rdsdebug("alloced tc %p\n", conn->c_transport_data); return 0; } @@ -317,6 +309,17 @@ static void rds_tcp_conn_free(void *arg) kmem_cache_free(rds_tcp_conn_slab, tc); } +static bool list_has_conn(struct list_head *list, struct rds_connection *conn) +{ + struct rds_tcp_connection *tc, *_tc; + + list_for_each_entry_safe(tc, _tc, list, t_tcp_node) { + if (tc->t_cpath->cp_conn == conn) + return true; + } + return false; +} + static void rds_tcp_destroy_conns(void) { struct rds_tcp_connection *tc, *_tc; @@ -324,29 +327,28 @@ static void rds_tcp_destroy_conns(void) /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&rds_tcp_conn_lock); - list_splice(&rds_tcp_conn_list, &tmp_list); - INIT_LIST_HEAD(&rds_tcp_conn_list); + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) + list_move_tail(&tc->t_tcp_node, &tmp_list); + } spin_unlock_irq(&rds_tcp_conn_lock); - list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { - if (tc->conn->c_passive) - rds_conn_destroy(tc->conn->c_passive); - rds_conn_destroy(tc->conn); - } + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) + rds_conn_destroy(tc->t_cpath->cp_conn); } static void rds_tcp_exit(void); struct rds_transport rds_tcp_transport = { .laddr_check = rds_tcp_laddr_check, - .xmit_prepare = rds_tcp_xmit_prepare, - .xmit_complete = rds_tcp_xmit_complete, + .xmit_path_prepare = rds_tcp_xmit_path_prepare, + .xmit_path_complete = rds_tcp_xmit_path_complete, .xmit = rds_tcp_xmit, - .recv = rds_tcp_recv, + .recv_path = rds_tcp_recv_path, .conn_alloc = rds_tcp_conn_alloc, .conn_free = rds_tcp_conn_free, - .conn_connect = rds_tcp_conn_connect, - .conn_shutdown = rds_tcp_conn_shutdown, + .conn_path_connect = rds_tcp_conn_path_connect, + .conn_path_shutdown = rds_tcp_conn_path_shutdown, .inc_copy_to_user = rds_tcp_inc_copy_to_user, .inc_free = rds_tcp_inc_free, .stats_info_copy = rds_tcp_stats_info_copy, @@ -355,6 +357,7 @@ struct rds_transport rds_tcp_transport = { .t_name = "tcp", .t_type = RDS_TRANS_TCP, .t_prefer_loopback = 1, + .t_mp_capable = 1, }; static int rds_tcp_netid; @@ -488,10 +491,30 @@ static struct pernet_operations rds_tcp_net_ops = { .size = sizeof(struct rds_tcp_net), }; +/* explicitly send a RST on each socket, thereby releasing any socket refcnts + * that may otherwise hold up netns deletion. + */ +static void rds_tcp_conn_paths_destroy(struct rds_connection *conn) +{ + struct rds_conn_path *cp; + struct rds_tcp_connection *tc; + int i; + struct sock *sk; + + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + cp = &conn->c_path[i]; + tc = cp->cp_transport_data; + if (!tc->t_sock) + continue; + sk = tc->t_sock->sk; + sk->sk_prot->disconnect(sk, 0); + tcp_done(sk); + } +} + static void rds_tcp_kill_sock(struct net *net) { struct rds_tcp_connection *tc, *_tc; - struct sock *sk; LIST_HEAD(tmp_list); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); @@ -500,23 +523,27 @@ static void rds_tcp_kill_sock(struct net *net) flush_work(&rtn->rds_tcp_accept_w); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = read_pnet(&tc->conn->c_net); + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; - list_move_tail(&tc->t_tcp_node, &tmp_list); + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) + list_move_tail(&tc->t_tcp_node, &tmp_list); } spin_unlock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { - sk = tc->t_sock->sk; - sk->sk_prot->disconnect(sk, 0); - tcp_done(sk); - if (tc->conn->c_passive) - rds_conn_destroy(tc->conn->c_passive); - rds_conn_destroy(tc->conn); + rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn); + rds_conn_destroy(tc->t_cpath->cp_conn); } } +void *rds_tcp_listen_sock_def_readable(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + return rtn->rds_tcp_listen_sock->sk->sk_user_data; +} + static int rds_tcp_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -551,12 +578,13 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = read_pnet(&tc->conn->c_net); + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; - rds_conn_drop(tc->conn); /* reconnect with new parameters */ + /* reconnect with new parameters */ + rds_conn_path_drop(tc->t_cpath); } spin_unlock_irq(&rds_tcp_conn_lock); } diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 7940babf6..9a1cc8906 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -11,11 +11,11 @@ struct rds_tcp_incoming { struct rds_tcp_connection { struct list_head t_tcp_node; - struct rds_connection *conn; - /* t_conn_lock synchronizes the connection establishment between - * rds_tcp_accept_one and rds_tcp_conn_connect + struct rds_conn_path *t_cpath; + /* t_conn_path_lock synchronizes the connection establishment between + * rds_tcp_accept_one and rds_tcp_conn_path_connect */ - struct mutex t_conn_lock; + struct mutex t_conn_path_lock; struct socket *t_sock; void *t_orig_write_space; void *t_orig_data_ready; @@ -49,8 +49,8 @@ struct rds_tcp_statistics { /* tcp.c */ void rds_tcp_tune(struct socket *sock); void rds_tcp_nonagle(struct socket *sock); -void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); -void rds_tcp_reset_callbacks(struct socket *sock, struct rds_connection *conn); +void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); +void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, struct rds_tcp_connection *tc); u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); @@ -60,8 +60,8 @@ extern struct rds_transport rds_tcp_transport; void rds_tcp_accept_work(struct sock *sk); /* tcp_connect.c */ -int rds_tcp_conn_connect(struct rds_connection *conn); -void rds_tcp_conn_shutdown(struct rds_connection *conn); +int rds_tcp_conn_path_connect(struct rds_conn_path *cp); +void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ @@ -70,18 +70,19 @@ void rds_tcp_listen_stop(struct socket *); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); int rds_tcp_keepalive(struct socket *sock); +void *rds_tcp_listen_sock_def_readable(struct net *net); /* tcp_recv.c */ int rds_tcp_recv_init(void); void rds_tcp_recv_exit(void); void rds_tcp_data_ready(struct sock *sk); -int rds_tcp_recv(struct rds_connection *conn); +int rds_tcp_recv_path(struct rds_conn_path *cp); void rds_tcp_inc_free(struct rds_incoming *inc); int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); /* tcp_send.c */ -void rds_tcp_xmit_prepare(struct rds_connection *conn); -void rds_tcp_xmit_complete(struct rds_connection *conn); +void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp); +void rds_tcp_xmit_path_complete(struct rds_conn_path *cp); int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_tcp_write_space(struct sock *sk); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index f6e95d60d..05f61c533 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -40,16 +40,16 @@ void rds_tcp_state_change(struct sock *sk) { void (*state_change)(struct sock *sk); - struct rds_connection *conn; + struct rds_conn_path *cp; struct rds_tcp_connection *tc; read_lock_bh(&sk->sk_callback_lock); - conn = sk->sk_user_data; - if (!conn) { + cp = sk->sk_user_data; + if (!cp) { state_change = sk->sk_state_change; goto out; } - tc = conn->c_transport_data; + tc = cp->cp_transport_data; state_change = tc->t_orig_state_change; rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); @@ -60,11 +60,11 @@ void rds_tcp_state_change(struct sock *sk) case TCP_SYN_RECV: break; case TCP_ESTABLISHED: - rds_connect_path_complete(conn, RDS_CONN_CONNECTING); + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); break; case TCP_CLOSE_WAIT: case TCP_CLOSE: - rds_conn_drop(conn); + rds_conn_path_drop(cp); default: break; } @@ -73,17 +73,24 @@ out: state_change(sk); } -int rds_tcp_conn_connect(struct rds_connection *conn) +int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; struct sockaddr_in src, dest; int ret; - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_connection *conn = cp->cp_conn; + struct rds_tcp_connection *tc = cp->cp_transport_data; - mutex_lock(&tc->t_conn_lock); + /* for multipath rds,we only trigger the connection after + * the handshake probe has determined the number of paths. + */ + if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2) + return -EAGAIN; + + mutex_lock(&tc->t_conn_path_lock); - if (rds_conn_up(conn)) { - mutex_unlock(&tc->t_conn_lock); + if (rds_conn_path_up(cp)) { + mutex_unlock(&tc->t_conn_path_lock); return 0; } ret = sock_create_kern(rds_conn_net(conn), PF_INET, @@ -112,10 +119,11 @@ int rds_tcp_conn_connect(struct rds_connection *conn) * once we call connect() we can start getting callbacks and they * own the socket */ - rds_tcp_set_callbacks(sock, conn); + rds_tcp_set_callbacks(sock, cp); ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), O_NONBLOCK); + cp->cp_outgoing = 1; rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; @@ -123,11 +131,11 @@ int rds_tcp_conn_connect(struct rds_connection *conn) rds_tcp_keepalive(sock); sock = NULL; } else { - rds_tcp_restore_callbacks(sock, conn->c_transport_data); + rds_tcp_restore_callbacks(sock, cp->cp_transport_data); } out: - mutex_unlock(&tc->t_conn_lock); + mutex_unlock(&tc->t_conn_path_lock); if (sock) sock_release(sock); return ret; @@ -142,12 +150,13 @@ out: * callbacks to those set by TCP. Our callbacks won't execute again once we * hold the sock lock. */ -void rds_tcp_conn_shutdown(struct rds_connection *conn) +void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *sock = tc->t_sock; - rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock); + rdsdebug("shutting down conn %p tc %p sock %p\n", + cp->cp_conn, tc, sock); if (sock) { sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 245542ca4..e0b23fb5b 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -70,6 +70,52 @@ bail: return ret; } +/* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the + * client's ipaddr < server's ipaddr. Otherwise, close the accepted + * socket and force a reconneect from smaller -> larger ip addr. The reason + * we special case cp_index 0 is to allow the rds probe ping itself to itself + * get through efficiently. + * Since reconnects are only initiated from the node with the numerically + * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side + * by moving them to CONNECTING in this function. + */ +struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) +{ + int i; + bool peer_is_smaller = (conn->c_faddr < conn->c_laddr); + int npaths = conn->c_npaths; + + if (npaths <= 1) { + struct rds_conn_path *cp = &conn->c_path[0]; + int ret; + + ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, + RDS_CONN_CONNECTING); + if (!ret) + rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_CONNECTING); + return cp->cp_transport_data; + } + + /* for mprds, paths with cp_index > 0 MUST be initiated by the peer + * with the smaller address. + */ + if (!peer_is_smaller) + return NULL; + + for (i = 1; i < npaths; i++) { + struct rds_conn_path *cp = &conn->c_path[i]; + + if (rds_conn_path_transition(cp, RDS_CONN_DOWN, + RDS_CONN_CONNECTING) || + rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_CONNECTING)) { + return cp->cp_transport_data; + } + } + return NULL; +} + int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; @@ -78,6 +124,7 @@ int rds_tcp_accept_one(struct socket *sock) struct inet_sock *inet; struct rds_tcp_connection *rs_tcp = NULL; int conn_state; + struct rds_conn_path *cp; if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; @@ -118,11 +165,14 @@ int rds_tcp_accept_one(struct socket *sock) * If the client reboots, this conn will need to be cleaned up. * rds_tcp_state_change() will do that cleanup */ - rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; - rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); - mutex_lock(&rs_tcp->t_conn_lock); - conn_state = rds_conn_state(conn); - if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP) + rs_tcp = rds_tcp_accept_one_path(conn); + if (!rs_tcp) + goto rst_nsk; + mutex_lock(&rs_tcp->t_conn_path_lock); + cp = rs_tcp->t_cpath; + conn_state = rds_conn_path_state(cp); + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP && + conn_state != RDS_CONN_ERROR) goto rst_nsk; if (rs_tcp->t_sock) { /* Need to resolve a duelling SYN between peers. @@ -132,17 +182,17 @@ int rds_tcp_accept_one(struct socket *sock) * c_transport_data. */ if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) || - !conn->c_outgoing) { + !cp->cp_outgoing) { goto rst_nsk; } else { - rds_tcp_reset_callbacks(new_sock, conn); - conn->c_outgoing = 0; + rds_tcp_reset_callbacks(new_sock, cp); + cp->cp_outgoing = 0; /* rds_connect_path_complete() marks RDS_CONN_UP */ - rds_connect_path_complete(conn, RDS_CONN_RESETTING); + rds_connect_path_complete(cp, RDS_CONN_RESETTING); } } else { - rds_tcp_set_callbacks(new_sock, conn); - rds_connect_path_complete(conn, RDS_CONN_CONNECTING); + rds_tcp_set_callbacks(new_sock, cp); + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); } new_sock = NULL; ret = 0; @@ -153,7 +203,7 @@ rst_nsk: ret = 0; out: if (rs_tcp) - mutex_unlock(&rs_tcp->t_conn_lock); + mutex_unlock(&rs_tcp->t_conn_path_lock); if (new_sock) sock_release(new_sock); return ret; @@ -180,6 +230,8 @@ void rds_tcp_listen_data_ready(struct sock *sk) */ if (sk->sk_state == TCP_LISTEN) rds_tcp_accept_work(sk); + else + ready = rds_tcp_listen_sock_def_readable(sock_net(sk)); out: read_unlock_bh(&sk->sk_callback_lock); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 6e6a7111a..ad4892e97 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -147,7 +147,7 @@ static void rds_tcp_cong_recv(struct rds_connection *conn, } struct rds_tcp_desc_arg { - struct rds_connection *conn; + struct rds_conn_path *conn_path; gfp_t gfp; }; @@ -155,8 +155,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct rds_tcp_desc_arg *arg = desc->arg.data; - struct rds_connection *conn = arg->conn; - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_conn_path *cp = arg->conn_path; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct rds_tcp_incoming *tinc = tc->t_tinc; struct sk_buff *clone; size_t left = len, to_copy; @@ -178,7 +178,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, } tc->t_tinc = tinc; rdsdebug("alloced tinc %p\n", tinc); - rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); + rds_inc_path_init(&tinc->ti_inc, cp, + cp->cp_conn->c_faddr); /* * XXX * we might be able to use the __ variants when * we've already serialized at a higher level. @@ -228,6 +229,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, } if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { + struct rds_connection *conn = cp->cp_conn; + if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) rds_tcp_cong_recv(conn, tinc); else @@ -250,15 +253,15 @@ out: } /* the caller has to hold the sock lock */ -static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp) +static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *sock = tc->t_sock; read_descriptor_t desc; struct rds_tcp_desc_arg arg; /* It's like glib in the kernel! */ - arg.conn = conn; + arg.conn_path = cp; arg.gfp = gfp; desc.arg.data = &arg; desc.error = 0; @@ -278,16 +281,17 @@ static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp) * if we fail to allocate we're in trouble.. blindly wait some time before * trying again to see if the VM can free up something for us. */ -int rds_tcp_recv(struct rds_connection *conn) +int rds_tcp_recv_path(struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *sock = tc->t_sock; int ret = 0; - rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); + rdsdebug("recv worker path [%d] tc %p sock %p\n", + cp->cp_index, tc, sock); lock_sock(sock->sk); - ret = rds_tcp_read_sock(conn, GFP_KERNEL); + ret = rds_tcp_read_sock(cp, GFP_KERNEL); release_sock(sock->sk); return ret; @@ -296,24 +300,24 @@ int rds_tcp_recv(struct rds_connection *conn) void rds_tcp_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); - struct rds_connection *conn; + struct rds_conn_path *cp; struct rds_tcp_connection *tc; rdsdebug("data ready sk %p\n", sk); read_lock_bh(&sk->sk_callback_lock); - conn = sk->sk_user_data; - if (!conn) { /* check for teardown race */ + cp = sk->sk_user_data; + if (!cp) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } - tc = conn->c_transport_data; + tc = cp->cp_transport_data; ready = tc->t_orig_data_ready; rds_tcp_stats_inc(s_tcp_data_ready_calls); - if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM) - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); out: read_unlock_bh(&sk->sk_callback_lock); ready(sk); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 618be69c9..89d09b481 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -34,6 +34,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "tcp.h" @@ -48,16 +49,16 @@ static void rds_tcp_cork(struct socket *sock, int val) set_fs(oldfs); } -void rds_tcp_xmit_prepare(struct rds_connection *conn) +void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; rds_tcp_cork(tc->t_sock, 1); } -void rds_tcp_xmit_complete(struct rds_connection *conn) +void rds_tcp_xmit_path_complete(struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; rds_tcp_cork(tc->t_sock, 0); } @@ -80,7 +81,8 @@ static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_conn_path *cp = rm->m_inc.i_conn_path; + struct rds_tcp_connection *tc = cp->cp_transport_data; int done = 0; int ret = 0; int more; @@ -149,10 +151,17 @@ out: rds_tcp_stats_inc(s_tcp_sndbuf_full); ret = 0; } else { - printk(KERN_WARNING "RDS/tcp: send to %pI4 " - "returned %d, disconnecting and reconnecting\n", - &conn->c_faddr, ret); - rds_conn_drop(conn); + /* No need to disconnect/reconnect if path_drop + * has already been triggered, because, e.g., of + * an incoming RST. + */ + if (rds_conn_path_up(cp)) { + pr_warn("RDS/tcp: send to %pI4 on cp [%d]" + "returned %d, " + "disconnecting and reconnecting\n", + &conn->c_faddr, cp->cp_index, ret); + rds_conn_path_drop(cp); + } } } if (done == 0) @@ -177,27 +186,27 @@ static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) void rds_tcp_write_space(struct sock *sk) { void (*write_space)(struct sock *sk); - struct rds_connection *conn; + struct rds_conn_path *cp; struct rds_tcp_connection *tc; read_lock_bh(&sk->sk_callback_lock); - conn = sk->sk_user_data; - if (!conn) { + cp = sk->sk_user_data; + if (!cp) { write_space = sk->sk_write_space; goto out; } - tc = conn->c_transport_data; + tc = cp->cp_transport_data; rdsdebug("write_space for tc %p\n", tc); write_space = tc->t_orig_write_space; rds_tcp_stats_inc(s_tcp_write_space_calls); rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc)); tc->t_last_seen_una = rds_tcp_snd_una(tc); - rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked); + rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked); if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); out: read_unlock_bh(&sk->sk_callback_lock); diff --git a/net/rds/threads.c b/net/rds/threads.c index 4a3230457..e42df11bf 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -71,30 +71,30 @@ struct workqueue_struct *rds_wq; EXPORT_SYMBOL_GPL(rds_wq); -void rds_connect_path_complete(struct rds_connection *conn, int curr) +void rds_connect_path_complete(struct rds_conn_path *cp, int curr) { - if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) { + if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { printk(KERN_WARNING "%s: Cannot transition to state UP, " "current state is %d\n", __func__, - atomic_read(&conn->c_state)); - rds_conn_drop(conn); + atomic_read(&cp->cp_state)); + rds_conn_path_drop(cp); return; } rdsdebug("conn %p for %pI4 to %pI4 complete\n", - conn, &conn->c_laddr, &conn->c_faddr); + cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); - conn->c_reconnect_jiffies = 0; - set_bit(0, &conn->c_map_queued); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + cp->cp_reconnect_jiffies = 0; + set_bit(0, &cp->cp_conn->c_map_queued); + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); } EXPORT_SYMBOL_GPL(rds_connect_path_complete); void rds_connect_complete(struct rds_connection *conn) { - rds_connect_path_complete(conn, RDS_CONN_CONNECTING); + rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING); } EXPORT_SYMBOL_GPL(rds_connect_complete); @@ -116,70 +116,87 @@ EXPORT_SYMBOL_GPL(rds_connect_complete); * We should *always* start with a random backoff; otherwise a broken connection * will always take several iterations to be re-established. */ -void rds_queue_reconnect(struct rds_connection *conn) +void rds_queue_reconnect(struct rds_conn_path *cp) { unsigned long rand; + struct rds_connection *conn = cp->cp_conn; rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", conn, &conn->c_laddr, &conn->c_faddr, - conn->c_reconnect_jiffies); + cp->cp_reconnect_jiffies); - set_bit(RDS_RECONNECT_PENDING, &conn->c_flags); - if (conn->c_reconnect_jiffies == 0) { - conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + /* let peer with smaller addr initiate reconnect, to avoid duels */ + if (conn->c_trans->t_type == RDS_TRANS_TCP && + conn->c_laddr > conn->c_faddr) + return; + + set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); + if (cp->cp_reconnect_jiffies == 0) { + cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; + queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); return; } get_random_bytes(&rand, sizeof(rand)); rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", - rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, + rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr); - queue_delayed_work(rds_wq, &conn->c_conn_w, - rand % conn->c_reconnect_jiffies); + queue_delayed_work(rds_wq, &cp->cp_conn_w, + rand % cp->cp_reconnect_jiffies); - conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, + cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, rds_sysctl_reconnect_max_jiffies); } void rds_connect_worker(struct work_struct *work) { - struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work); + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_conn_w.work); + struct rds_connection *conn = cp->cp_conn; int ret; - clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); - if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { - ret = conn->c_trans->conn_connect(conn); + if (cp->cp_index > 1 && cp->cp_conn->c_laddr > cp->cp_conn->c_faddr) + return; + clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); + ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); + if (ret) { + ret = conn->c_trans->conn_path_connect(cp); rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", conn, &conn->c_laddr, &conn->c_faddr, ret); if (ret) { - if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) - rds_queue_reconnect(conn); + if (rds_conn_path_transition(cp, + RDS_CONN_CONNECTING, + RDS_CONN_DOWN)) + rds_queue_reconnect(cp); else - rds_conn_error(conn, "RDS: connect failed\n"); + rds_conn_path_error(cp, + "RDS: connect failed\n"); } } } void rds_send_worker(struct work_struct *work) { - struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_send_w.work); int ret; - if (rds_conn_state(conn) == RDS_CONN_UP) { - clear_bit(RDS_LL_SEND_FULL, &conn->c_flags); - ret = rds_send_xmit(conn); + if (rds_conn_path_state(cp) == RDS_CONN_UP) { + clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); + ret = rds_send_xmit(cp); cond_resched(); - rdsdebug("conn %p ret %d\n", conn, ret); + rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { case -EAGAIN: rds_stats_inc(s_send_immediate_retry); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); break; case -ENOMEM: rds_stats_inc(s_send_delayed_retry); - queue_delayed_work(rds_wq, &conn->c_send_w, 2); + queue_delayed_work(rds_wq, &cp->cp_send_w, 2); default: break; } @@ -188,20 +205,22 @@ void rds_send_worker(struct work_struct *work) void rds_recv_worker(struct work_struct *work) { - struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work); + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_recv_w.work); int ret; - if (rds_conn_state(conn) == RDS_CONN_UP) { - ret = conn->c_trans->recv(conn); - rdsdebug("conn %p ret %d\n", conn, ret); + if (rds_conn_path_state(cp) == RDS_CONN_UP) { + ret = cp->cp_conn->c_trans->recv_path(cp); + rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { case -EAGAIN: rds_stats_inc(s_recv_immediate_retry); - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); break; case -ENOMEM: rds_stats_inc(s_recv_delayed_retry); - queue_delayed_work(rds_wq, &conn->c_recv_w, 2); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); default: break; } @@ -210,9 +229,11 @@ void rds_recv_worker(struct work_struct *work) void rds_shutdown_worker(struct work_struct *work) { - struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_down_w); - rds_conn_shutdown(conn); + rds_conn_shutdown(cp); } void rds_threads_exit(void) diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index e05a06ef2..10f3f48a1 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -4,25 +4,28 @@ af-rxrpc-y := \ af_rxrpc.o \ - ar-accept.o \ - ar-ack.o \ - ar-call.o \ - ar-connection.o \ - ar-connevent.o \ - ar-error.o \ - ar-input.o \ - ar-key.o \ - ar-local.o \ - ar-output.o \ - ar-peer.o \ - ar-recvmsg.o \ - ar-security.o \ - ar-skbuff.o \ - ar-transport.o \ + call_accept.o \ + call_event.o \ + call_object.o \ + conn_client.o \ + conn_event.o \ + conn_object.o \ + conn_service.o \ + input.o \ insecure.o \ - misc.o + key.o \ + local_event.o \ + local_object.o \ + misc.o \ + output.o \ + peer_event.o \ + peer_object.o \ + recvmsg.o \ + security.o \ + skbuff.o \ + utils.o -af-rxrpc-$(CONFIG_PROC_FS) += ar-proc.o +af-rxrpc-$(CONFIG_PROC_FS) += proc.o af-rxrpc-$(CONFIG_RXKAD) += rxkad.o af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index e45e94ca0..88effadd4 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -31,8 +33,6 @@ unsigned int rxrpc_debug; // = RXRPC_DEBUG_KPROTO; module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(debug, "RxRPC debugging mask"); -static int sysctl_rxrpc_max_qlen __read_mostly = 10; - static struct proto rxrpc_proto; static const struct proto_ops rxrpc_rpc_ops; @@ -97,11 +97,13 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, srx->transport_len > len) return -EINVAL; - if (srx->transport.family != rx->proto) + if (srx->transport.family != rx->family) return -EAFNOSUPPORT; switch (srx->transport.family) { case AF_INET: + if (srx->transport_len < sizeof(struct sockaddr_in)) + return -EINVAL; _debug("INET: %x @ %pI4", ntohs(srx->transport.sin.sin_port), &srx->transport.sin.sin_addr); @@ -137,33 +139,33 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) lock_sock(&rx->sk); - if (rx->sk.sk_state != RXRPC_UNCONNECTED) { + if (rx->sk.sk_state != RXRPC_UNBOUND) { ret = -EINVAL; goto error_unlock; } memcpy(&rx->srx, srx, sizeof(rx->srx)); - /* Find or create a local transport endpoint to use */ local = rxrpc_lookup_local(&rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); goto error_unlock; } - rx->local = local; - if (srx->srx_service) { + if (rx->srx.srx_service) { write_lock_bh(&local->services_lock); list_for_each_entry(prx, &local->services, listen_link) { - if (prx->srx.srx_service == srx->srx_service) + if (prx->srx.srx_service == rx->srx.srx_service) goto service_in_use; } + rx->local = local; list_add_tail(&rx->listen_link, &local->services); write_unlock_bh(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; } else { + rx->local = local; rx->sk.sk_state = RXRPC_CLIENT_BOUND; } @@ -172,8 +174,9 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) return 0; service_in_use: - ret = -EADDRINUSE; write_unlock_bh(&local->services_lock); + rxrpc_put_local(local); + ret = -EADDRINUSE; error_unlock: release_sock(&rx->sk); error: @@ -188,6 +191,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); + unsigned int max; int ret; _enter("%p,%d", rx, backlog); @@ -195,20 +199,24 @@ static int rxrpc_listen(struct socket *sock, int backlog) lock_sock(&rx->sk); switch (rx->sk.sk_state) { - case RXRPC_UNCONNECTED: + case RXRPC_UNBOUND: ret = -EADDRNOTAVAIL; break; - case RXRPC_CLIENT_BOUND: - case RXRPC_CLIENT_CONNECTED: - default: - ret = -EBUSY; - break; case RXRPC_SERVER_BOUND: ASSERT(rx->local != NULL); + max = READ_ONCE(rxrpc_max_backlog); + ret = -EINVAL; + if (backlog == INT_MAX) + backlog = max; + else if (backlog < 0 || backlog > max) + break; sk->sk_max_ack_backlog = backlog; rx->sk.sk_state = RXRPC_SERVER_LISTENING; ret = 0; break; + default: + ret = -EBUSY; + break; } release_sock(&rx->sk); @@ -216,45 +224,10 @@ static int rxrpc_listen(struct socket *sock, int backlog) return ret; } -/* - * find a transport by address - */ -static struct rxrpc_transport *rxrpc_name_to_transport(struct socket *sock, - struct sockaddr *addr, - int addr_len, int flags, - gfp_t gfp) -{ - struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; - struct rxrpc_transport *trans; - struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - struct rxrpc_peer *peer; - - _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); - - ASSERT(rx->local != NULL); - ASSERT(rx->sk.sk_state > RXRPC_UNCONNECTED); - - if (rx->srx.transport_type != srx->transport_type) - return ERR_PTR(-ESOCKTNOSUPPORT); - if (rx->srx.transport.family != srx->transport.family) - return ERR_PTR(-EAFNOSUPPORT); - - /* find a remote transport endpoint from the local one */ - peer = rxrpc_get_peer(srx, gfp); - if (IS_ERR(peer)) - return ERR_CAST(peer); - - /* find a transport */ - trans = rxrpc_get_transport(rx->local, peer, gfp); - rxrpc_put_peer(peer); - _leave(" = %p", trans); - return trans; -} - /** * rxrpc_kernel_begin_call - Allow a kernel service to begin a call * @sock: The socket on which to make the call - * @srx: The address of the peer to contact (defaults to socket setting) + * @srx: The address of the peer to contact * @key: The security context to use (defaults to socket setting) * @user_call_ID: The ID to use * @@ -271,51 +244,32 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, unsigned long user_call_ID, gfp_t gfp) { - struct rxrpc_conn_bundle *bundle; - struct rxrpc_transport *trans; + struct rxrpc_conn_parameters cp; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + int ret; _enter(",,%x,%lx", key_serial(key), user_call_ID); - lock_sock(&rx->sk); + ret = rxrpc_validate_address(rx, srx, sizeof(*srx)); + if (ret < 0) + return ERR_PTR(ret); - if (srx) { - trans = rxrpc_name_to_transport(sock, (struct sockaddr *) srx, - sizeof(*srx), 0, gfp); - if (IS_ERR(trans)) { - call = ERR_CAST(trans); - trans = NULL; - goto out_notrans; - } - } else { - trans = rx->trans; - if (!trans) { - call = ERR_PTR(-ENOTCONN); - goto out_notrans; - } - atomic_inc(&trans->usage); - } + lock_sock(&rx->sk); - if (!srx) - srx = &rx->srx; if (!key) key = rx->key; if (key && !key->payload.data[0]) key = NULL; /* a no-security key */ - bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp); - if (IS_ERR(bundle)) { - call = ERR_CAST(bundle); - goto out; - } + memset(&cp, 0, sizeof(cp)); + cp.local = rx->local; + cp.key = key; + cp.security_level = 0; + cp.exclusive = false; + cp.service_id = srx->srx_service; + call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); - call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, true, - gfp); - rxrpc_put_bundle(trans, bundle); -out: - rxrpc_put_transport(trans); -out_notrans: release_sock(&rx->sk); _leave(" = %p", call); return call; @@ -367,11 +321,8 @@ EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages); static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, int addr_len, int flags) { - struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; - struct sock *sk = sock->sk; - struct rxrpc_transport *trans; - struct rxrpc_local *local; - struct rxrpc_sock *rx = rxrpc_sk(sk); + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)addr; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); @@ -384,45 +335,28 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, lock_sock(&rx->sk); + ret = -EISCONN; + if (test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) + goto error; + switch (rx->sk.sk_state) { - case RXRPC_UNCONNECTED: - /* find a local transport endpoint if we don't have one already */ - ASSERTCMP(rx->local, ==, NULL); - rx->srx.srx_family = AF_RXRPC; - rx->srx.srx_service = 0; - rx->srx.transport_type = srx->transport_type; - rx->srx.transport_len = sizeof(sa_family_t); - rx->srx.transport.family = srx->transport.family; - local = rxrpc_lookup_local(&rx->srx); - if (IS_ERR(local)) { - release_sock(&rx->sk); - return PTR_ERR(local); - } - rx->local = local; - rx->sk.sk_state = RXRPC_CLIENT_BOUND; + case RXRPC_UNBOUND: + rx->sk.sk_state = RXRPC_CLIENT_UNBOUND; + case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: break; - case RXRPC_CLIENT_CONNECTED: - release_sock(&rx->sk); - return -EISCONN; default: - release_sock(&rx->sk); - return -EBUSY; /* server sockets can't connect as well */ - } - - trans = rxrpc_name_to_transport(sock, addr, addr_len, flags, - GFP_KERNEL); - if (IS_ERR(trans)) { - release_sock(&rx->sk); - _leave(" = %ld", PTR_ERR(trans)); - return PTR_ERR(trans); + ret = -EBUSY; + goto error; } - rx->trans = trans; - rx->sk.sk_state = RXRPC_CLIENT_CONNECTED; + rx->connect_srx = *srx; + set_bit(RXRPC_SOCK_CONNECTED, &rx->flags); + ret = 0; +error: release_sock(&rx->sk); - return 0; + return ret; } /* @@ -436,7 +370,7 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, */ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { - struct rxrpc_transport *trans; + struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; @@ -453,48 +387,38 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) } } - trans = NULL; lock_sock(&rx->sk); - if (m->msg_name) { - ret = -EISCONN; - trans = rxrpc_name_to_transport(sock, m->msg_name, - m->msg_namelen, 0, GFP_KERNEL); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - } else { - trans = rx->trans; - if (trans) - atomic_inc(&trans->usage); - } - switch (rx->sk.sk_state) { - case RXRPC_SERVER_LISTENING: - if (!m->msg_name) { - ret = rxrpc_server_sendmsg(rx, m, len); - break; + case RXRPC_UNBOUND: + local = rxrpc_lookup_local(&rx->srx); + if (IS_ERR(local)) { + ret = PTR_ERR(local); + goto error_unlock; } - case RXRPC_SERVER_BOUND: + + rx->local = local; + rx->sk.sk_state = RXRPC_CLIENT_UNBOUND; + /* Fall through */ + + case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: - if (!m->msg_name) { - ret = -ENOTCONN; - break; + if (!m->msg_name && + test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) { + m->msg_name = &rx->connect_srx; + m->msg_namelen = sizeof(rx->connect_srx); } - case RXRPC_CLIENT_CONNECTED: - ret = rxrpc_client_sendmsg(rx, trans, m, len); + case RXRPC_SERVER_BOUND: + case RXRPC_SERVER_LISTENING: + ret = rxrpc_do_sendmsg(rx, m, len); break; default: - ret = -ENOTCONN; + ret = -EINVAL; break; } -out: +error_unlock: release_sock(&rx->sk); - if (trans) - rxrpc_put_transport(trans); _leave(" = %d", ret); return ret; } @@ -521,9 +445,9 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (optlen != 0) goto error; ret = -EISCONN; - if (rx->sk.sk_state != RXRPC_UNCONNECTED) + if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; - set_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags); + rx->exclusive = true; goto success; case RXRPC_SECURITY_KEY: @@ -531,7 +455,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (rx->key) goto error; ret = -EISCONN; - if (rx->sk.sk_state != RXRPC_UNCONNECTED) + if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = rxrpc_request_key(rx, optval, optlen); goto error; @@ -541,7 +465,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (rx->key) goto error; ret = -EISCONN; - if (rx->sk.sk_state != RXRPC_UNCONNECTED) + if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = rxrpc_server_keyring(rx, optval, optlen); goto error; @@ -551,7 +475,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(unsigned int)) goto error; ret = -EISCONN; - if (rx->sk.sk_state != RXRPC_UNCONNECTED) + if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = get_user(min_sec_level, (unsigned int __user *) optval); @@ -630,13 +554,13 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, return -ENOMEM; sock_init_data(sock, sk); - sk->sk_state = RXRPC_UNCONNECTED; + sk->sk_state = RXRPC_UNBOUND; sk->sk_write_space = rxrpc_write_space; - sk->sk_max_ack_backlog = sysctl_rxrpc_max_qlen; + sk->sk_max_ack_backlog = 0; sk->sk_destruct = rxrpc_sock_destructor; rx = rxrpc_sk(sk); - rx->proto = protocol; + rx->family = protocol; rx->calls = RB_ROOT; INIT_LIST_HEAD(&rx->listen_link); @@ -698,24 +622,8 @@ static int rxrpc_release_sock(struct sock *sk) flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); - if (rx->conn) { - rxrpc_put_connection(rx->conn); - rx->conn = NULL; - } - - if (rx->bundle) { - rxrpc_put_bundle(rx->trans, rx->bundle); - rx->bundle = NULL; - } - if (rx->trans) { - rxrpc_put_transport(rx->trans); - rx->trans = NULL; - } - if (rx->local) { - rxrpc_put_local(rx->local); - rx->local = NULL; - } - + rxrpc_put_local(rx->local); + rx->local = NULL; key_put(rx->key); rx->key = NULL; key_put(rx->securities); @@ -796,49 +704,49 @@ static int __init af_rxrpc_init(void) "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, SLAB_HWCACHE_ALIGN, NULL); if (!rxrpc_call_jar) { - printk(KERN_NOTICE "RxRPC: Failed to allocate call jar\n"); + pr_notice("Failed to allocate call jar\n"); goto error_call_jar; } rxrpc_workqueue = alloc_workqueue("krxrpcd", 0, 1); if (!rxrpc_workqueue) { - printk(KERN_NOTICE "RxRPC: Failed to allocate work queue\n"); + pr_notice("Failed to allocate work queue\n"); goto error_work_queue; } ret = rxrpc_init_security(); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot initialise security\n"); + pr_crit("Cannot initialise security\n"); goto error_security; } ret = proto_register(&rxrpc_proto, 1); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register protocol\n"); + pr_crit("Cannot register protocol\n"); goto error_proto; } ret = sock_register(&rxrpc_family_ops); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register socket family\n"); + pr_crit("Cannot register socket family\n"); goto error_sock; } ret = register_key_type(&key_type_rxrpc); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register client key type\n"); + pr_crit("Cannot register client key type\n"); goto error_key_type; } ret = register_key_type(&key_type_rxrpc_s); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register server key type\n"); + pr_crit("Cannot register server key type\n"); goto error_key_type_s; } ret = rxrpc_sysctl_init(); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register sysctls\n"); + pr_crit("Cannot register sysctls\n"); goto error_sysctls; } @@ -858,9 +766,9 @@ error_key_type: error_sock: proto_unregister(&rxrpc_proto); error_proto: - destroy_workqueue(rxrpc_workqueue); -error_security: rxrpc_exit_security(); +error_security: + destroy_workqueue(rxrpc_workqueue); error_work_queue: kmem_cache_destroy(rxrpc_call_jar); error_call_jar: @@ -880,14 +788,9 @@ static void __exit af_rxrpc_exit(void) proto_unregister(&rxrpc_proto); rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); - rxrpc_destroy_all_transports(); - rxrpc_destroy_all_peers(); - rxrpc_destroy_all_locals(); - ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); + rxrpc_destroy_all_locals(); - _debug("flush scheduled work"); - flush_workqueue(rxrpc_workqueue); remove_proc_entry("rxrpc_conns", init_net.proc_net); remove_proc_entry("rxrpc_calls", init_net.proc_net); destroy_workqueue(rxrpc_workqueue); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f0b807a16..ff83fb1dd 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -9,7 +9,10 @@ * 2 of the License, or (at your option) any later version. */ +#include +#include #include +#include #include #if 0 @@ -33,15 +36,16 @@ struct rxrpc_crypt { queue_delayed_work(rxrpc_workqueue, (WS), (D)) #define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor) -#define rxrpc_queue_conn(CONN) rxrpc_queue_work(&(CONN)->processor) + +struct rxrpc_connection; /* * sk_state for RxRPC sockets */ enum { - RXRPC_UNCONNECTED = 0, + RXRPC_UNBOUND = 0, + RXRPC_CLIENT_UNBOUND, /* Unbound socket used as client */ RXRPC_CLIENT_BOUND, /* client local address bound */ - RXRPC_CLIENT_CONNECTED, /* client is connected */ RXRPC_SERVER_BOUND, /* server local address bound */ RXRPC_SERVER_LISTENING, /* server listening for connections */ RXRPC_CLOSE, /* socket is being closed */ @@ -55,9 +59,6 @@ struct rxrpc_sock { struct sock sk; rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */ struct rxrpc_local *local; /* local endpoint */ - struct rxrpc_transport *trans; /* transport handler */ - struct rxrpc_conn_bundle *bundle; /* virtual connection bundle */ - struct rxrpc_connection *conn; /* exclusive virtual connection */ struct list_head listen_link; /* link in the local endpoint's listen list */ struct list_head secureq; /* calls awaiting connection security clearance */ struct list_head acceptq; /* calls awaiting acceptance */ @@ -65,12 +66,14 @@ struct rxrpc_sock { struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* outstanding calls on this socket */ unsigned long flags; -#define RXRPC_SOCK_EXCLUSIVE_CONN 1 /* exclusive connection for a client socket */ +#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT + bool exclusive; /* Exclusive connection for a client socket */ + sa_family_t family; /* Protocol family created with */ struct sockaddr_rxrpc srx; /* local address */ - sa_family_t proto; /* protocol created with */ + struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */ }; #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) @@ -138,17 +141,16 @@ struct rxrpc_security { int (*init_connection_security)(struct rxrpc_connection *); /* prime a connection's packet security */ - void (*prime_packet_security)(struct rxrpc_connection *); + int (*prime_packet_security)(struct rxrpc_connection *); /* impose security on a packet */ - int (*secure_packet)(const struct rxrpc_call *, + int (*secure_packet)(struct rxrpc_call *, struct sk_buff *, size_t, void *); /* verify the security on a received packet */ - int (*verify_packet)(const struct rxrpc_call *, struct sk_buff *, - u32 *); + int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -168,46 +170,52 @@ struct rxrpc_security { }; /* - * RxRPC local transport endpoint definition - * - matched by local port, address and protocol type + * RxRPC local transport endpoint description + * - owned by a single AF_RXRPC socket + * - pointed to by transport socket struct sk_user_data */ struct rxrpc_local { + struct rcu_head rcu; + atomic_t usage; + struct list_head link; struct socket *socket; /* my UDP socket */ - struct work_struct destroyer; /* endpoint destroyer */ - struct work_struct acceptor; /* incoming call processor */ - struct work_struct rejecter; /* packet reject writer */ - struct work_struct event_processor; /* endpoint event processor */ + struct work_struct processor; struct list_head services; /* services listening on this endpoint */ - struct list_head link; /* link in endpoint list */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ + struct rb_root client_conns; /* Client connections by socket params */ + spinlock_t client_conns_lock; /* Lock for client_conns */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ - atomic_t usage; int debug_id; /* debug ID for printks */ - volatile char error_rcvd; /* T if received ICMP error outstanding */ + bool dead; struct sockaddr_rxrpc srx; /* local address */ }; /* * RxRPC remote transport endpoint definition - * - matched by remote port, address and protocol type - * - holds the connection ID counter for connections between the two endpoints + * - matched by local endpoint, remote port, address and protocol type */ struct rxrpc_peer { - struct work_struct destroyer; /* peer destroyer */ - struct list_head link; /* link in master peer list */ - struct list_head error_targets; /* targets for net error distribution */ - spinlock_t lock; /* access lock */ + struct rcu_head rcu; /* This must be first */ atomic_t usage; + unsigned long hash_key; + struct hlist_node hash_link; + struct rxrpc_local *local; + struct hlist_head error_targets; /* targets for net error distribution */ + struct work_struct error_distributor; + struct rb_root service_conns; /* Service connections */ + seqlock_t service_conn_lock; + spinlock_t lock; /* access lock */ unsigned int if_mtu; /* interface MTU for this peer */ unsigned int mtu; /* network MTU for this peer */ unsigned int maxdata; /* data size (MTU - hdrsize) */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ - int net_error; /* network error distributed */ + int error_report; /* Net (+0) or local (+1000000) to distribute */ +#define RXRPC_LOCAL_ERROR_OFFSET 1000000 struct sockaddr_rxrpc srx; /* remote address */ /* calculated RTT cache */ @@ -219,99 +227,108 @@ struct rxrpc_peer { }; /* - * RxRPC point-to-point transport / connection manager definition - * - handles a bundle of connections between two endpoints - * - matched by { local, peer } - */ -struct rxrpc_transport { - struct rxrpc_local *local; /* local transport endpoint */ - struct rxrpc_peer *peer; /* remote transport endpoint */ - struct work_struct error_handler; /* network error distributor */ - struct rb_root bundles; /* client connection bundles on this transport */ - struct rb_root client_conns; /* client connections on this transport */ - struct rb_root server_conns; /* server connections on this transport */ - struct list_head link; /* link in master session list */ - struct sk_buff_head error_queue; /* error packets awaiting processing */ - unsigned long put_time; /* time at which to reap */ - spinlock_t client_lock; /* client connection allocation lock */ - rwlock_t conn_lock; /* lock for active/dead connections */ - atomic_t usage; - int debug_id; /* debug ID for printks */ - unsigned int conn_idcounter; /* connection ID counter (client) */ + * Keys for matching a connection. + */ +struct rxrpc_conn_proto { + union { + struct { + u32 epoch; /* epoch of this connection */ + u32 cid; /* connection ID */ + }; + u64 index_key; + }; +}; + +struct rxrpc_conn_parameters { + struct rxrpc_local *local; /* Representation of local endpoint */ + struct rxrpc_peer *peer; /* Remote endpoint */ + struct key *key; /* Security details */ + bool exclusive; /* T if conn is exclusive */ + u16 service_id; /* Service ID for this connection */ + u32 security_level; /* Security level selected */ }; /* - * RxRPC client connection bundle - * - matched by { transport, service_id, key } + * Bits in the connection flags. */ -struct rxrpc_conn_bundle { - struct rb_node node; /* node in transport's lookup tree */ - struct list_head unused_conns; /* unused connections in this bundle */ - struct list_head avail_conns; /* available connections in this bundle */ - struct list_head busy_conns; /* busy connections in this bundle */ - struct key *key; /* security for this bundle */ - wait_queue_head_t chanwait; /* wait for channel to become available */ - atomic_t usage; - int debug_id; /* debug ID for printks */ - unsigned short num_conns; /* number of connections in this bundle */ - u16 service_id; /* Service ID for this bundle */ - u8 security_ix; /* security type */ +enum rxrpc_conn_flag { + RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ + RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ + RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */ +}; + +/* + * Events that can be raised upon a connection. + */ +enum rxrpc_conn_event { + RXRPC_CONN_EV_CHALLENGE, /* Send challenge packet */ +}; + +/* + * The connection protocol state. + */ +enum rxrpc_conn_proto_state { + RXRPC_CONN_UNUSED, /* Connection not yet attempted */ + RXRPC_CONN_CLIENT, /* Client connection */ + RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ + RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ + RXRPC_CONN_SERVICE, /* Service secured connection */ + RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */ + RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */ + RXRPC_CONN_NETWORK_ERROR, /* Conn terminated by network error */ + RXRPC_CONN__NR_STATES }; /* * RxRPC connection definition - * - matched by { transport, service_id, conn_id, direction, key } + * - matched by { local, peer, epoch, conn_id, direction } * - each connection can only handle four simultaneous calls */ struct rxrpc_connection { - struct rxrpc_transport *trans; /* transport session */ - struct rxrpc_conn_bundle *bundle; /* connection bundle (client) */ + struct rxrpc_conn_proto proto; + struct rxrpc_conn_parameters params; + + spinlock_t channel_lock; + + struct rxrpc_channel { + struct rxrpc_call __rcu *call; /* Active call */ + u32 call_id; /* ID of current call */ + u32 call_counter; /* Call ID counter */ + u32 last_call; /* ID of last call */ + u32 last_result; /* Result of last call (0/abort) */ + } channels[RXRPC_MAXCALLS]; + wait_queue_head_t channel_wq; /* queue to wait for channel to become available */ + + struct rcu_head rcu; struct work_struct processor; /* connection event processor */ - struct rb_node node; /* node in transport's lookup tree */ + union { + struct rb_node client_node; /* Node in local->client_conns */ + struct rb_node service_node; /* Node in peer->service_conns */ + }; struct list_head link; /* link in master connection list */ - struct list_head bundle_link; /* link in bundle */ - struct rb_root calls; /* calls on this connection */ struct sk_buff_head rx_queue; /* received conn-level packets */ - struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */ const struct rxrpc_security *security; /* applied security module */ - struct key *key; /* security for this connection (client) */ struct key *server_key; /* security for this service */ struct crypto_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ + unsigned long flags; unsigned long events; -#define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */ - unsigned long put_time; /* time at which to reap */ - rwlock_t lock; /* access lock */ + unsigned long put_time; /* Time at which last put */ spinlock_t state_lock; /* state-change lock */ atomic_t usage; - enum { /* current state of connection */ - RXRPC_CONN_UNUSED, /* - connection not yet attempted */ - RXRPC_CONN_CLIENT, /* - client connection */ - RXRPC_CONN_SERVER_UNSECURED, /* - server unsecured connection */ - RXRPC_CONN_SERVER_CHALLENGING, /* - server challenging for security */ - RXRPC_CONN_SERVER, /* - server secured connection */ - RXRPC_CONN_REMOTELY_ABORTED, /* - conn aborted by peer */ - RXRPC_CONN_LOCALLY_ABORTED, /* - conn aborted locally */ - RXRPC_CONN_NETWORK_ERROR, /* - conn terminated by network error */ - } state; + enum rxrpc_conn_proto_state state : 8; /* current state of connection */ u32 local_abort; /* local abort code */ u32 remote_abort; /* remote abort code */ int error; /* local error incurred */ int debug_id; /* debug ID for printks */ - unsigned int call_counter; /* call ID counter */ atomic_t serial; /* packet serial number counter */ atomic_t hi_serial; /* highest serial number received */ - u8 avail_calls; /* number of calls available */ + atomic_t avail_chans; /* number of channels available */ u8 size_align; /* data size alignment (for security) */ u8 header_size; /* rxrpc + security header size */ u8 security_size; /* security header size */ - u32 security_level; /* security level negotiated */ u32 security_nonce; /* response re-use preventer */ - u32 epoch; /* epoch of this connection */ - u32 cid; /* connection ID */ - u16 service_id; /* service ID for this connection */ u8 security_ix; /* security type */ - u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; @@ -357,6 +374,8 @@ enum rxrpc_call_event { * The states that a call can be in. */ enum rxrpc_call_state { + RXRPC_CALL_UNINITIALISED, + RXRPC_CALL_CLIENT_AWAIT_CONN, /* - client waiting for connection to become available */ RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ @@ -381,6 +400,7 @@ enum rxrpc_call_state { * - matched by { connection, call_id } */ struct rxrpc_call { + struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_sock *socket; /* socket responsible */ struct timer_list lifetimer; /* lifetime remaining on call */ @@ -390,14 +410,14 @@ struct rxrpc_call { struct work_struct destroyer; /* call destroyer */ struct work_struct processor; /* packet processor and ACK generator */ struct list_head link; /* link in master call list */ - struct list_head error_link; /* link in error distribution list */ + struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* calls awaiting acceptance */ struct rb_node sock_node; /* node in socket call tree */ - struct rb_node conn_node; /* node in connection call tree */ struct sk_buff_head rx_queue; /* received packets */ struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */ + __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ unsigned long user_call_ID; /* user-defined call ID */ unsigned long creation_jif; /* time of call creation */ unsigned long flags; @@ -405,10 +425,12 @@ struct rxrpc_call { spinlock_t lock; rwlock_t state_lock; /* lock for state transition */ atomic_t usage; + atomic_t skb_count; /* Outstanding packets on this call */ atomic_t sequence; /* Tx data packet sequence counter */ u32 local_abort; /* local abort code */ u32 remote_abort; /* remote abort code */ - int error; /* local error incurred */ + int error_report; /* Network error (ICMP/local transport) */ + int error; /* Local error incurred */ enum rxrpc_call_state state : 8; /* current state of call */ int debug_id; /* debug ID for printks */ u8 channel; /* connection channel occupied by this call */ @@ -440,19 +462,12 @@ struct rxrpc_call { #define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; - struct hlist_node hash_node; - unsigned long hash_key; /* Full hash key */ - u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */ - struct rxrpc_local *local; /* Local endpoint. Used for hashing. */ - sa_family_t proto; /* Frame protocol */ + u8 in_clientflag; /* Copy of conn->in_clientflag */ + struct rxrpc_local *local; /* Local endpoint. */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 epoch; /* epoch of this connection */ u16 service_id; /* service ID */ - union { /* Peer IP address for hashing */ - __be32 ipv4_addr; - __u8 ipv6_addr[16]; /* Anticipates eventual IPv6 support */ - } peer_ip; }; /* @@ -478,21 +493,21 @@ extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; /* - * ar-accept.c + * call_accept.c */ -void rxrpc_accept_incoming_calls(struct work_struct *); +void rxrpc_accept_incoming_calls(struct rxrpc_local *); struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long); int rxrpc_reject_call(struct rxrpc_sock *); /* - * ar-ack.c + * call_event.c */ void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); void rxrpc_process_call(struct work_struct *); /* - * ar-call.c + * call_object.c */ extern unsigned int rxrpc_max_call_lifetime; extern unsigned int rxrpc_dead_call_expiry; @@ -500,72 +515,106 @@ extern struct kmem_cache *rxrpc_call_jar; extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; -struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *, - void *, sa_family_t, const void *); -struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *, - struct rxrpc_transport *, - struct rxrpc_conn_bundle *, - unsigned long, int, gfp_t); +struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); +struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, + struct rxrpc_conn_parameters *, + struct sockaddr_rxrpc *, + unsigned long, gfp_t); struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_connection *, - struct rxrpc_host_header *); -struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long); + struct sk_buff *); void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void __rxrpc_put_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); /* - * ar-connection.c + * conn_client.c + */ +extern struct idr rxrpc_client_conn_ids; + +void rxrpc_destroy_client_conn_ids(void); +int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, + struct sockaddr_rxrpc *, gfp_t); +void rxrpc_unpublish_client_conn(struct rxrpc_connection *); + +/* + * conn_event.c + */ +void rxrpc_process_connection(struct work_struct *); +void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); +void rxrpc_reject_packets(struct rxrpc_local *); + +/* + * conn_object.c */ extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; -struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *, - struct rxrpc_transport *, - struct key *, u16, gfp_t); -void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *); -int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *, - struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t); +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); +struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); +struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, + struct sk_buff *); +void __rxrpc_disconnect_call(struct rxrpc_call *); +void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); -struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, - struct rxrpc_host_header *); -extern struct rxrpc_connection * -rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *); -/* - * ar-connevent.c - */ -void rxrpc_process_connection(struct work_struct *); -void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); -void rxrpc_reject_packets(struct work_struct *); +static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) +{ + return conn->out_clientflag; +} + +static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) +{ + return !rxrpc_conn_is_client(conn); +} + +static inline void rxrpc_get_connection(struct rxrpc_connection *conn) +{ + atomic_inc(&conn->usage); +} + +static inline +struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +{ + return atomic_inc_not_zero(&conn->usage) ? conn : NULL; +} + +static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) +{ + if (!rxrpc_get_connection_maybe(conn)) + return false; + if (!rxrpc_queue_work(&conn->processor)) + rxrpc_put_connection(conn); + return true; +} /* - * ar-error.c + * conn_service.c */ -void rxrpc_UDP_error_report(struct sock *); -void rxrpc_UDP_error_handler(struct work_struct *); +struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, + struct sk_buff *); +struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, + struct sockaddr_rxrpc *, + struct sk_buff *); +void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* - * ar-input.c + * input.c */ void rxrpc_data_ready(struct sock *); int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); /* - * ar-local.c + * insecure.c */ -extern rwlock_t rxrpc_local_lock; - -struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *); -void rxrpc_put_local(struct rxrpc_local *); -void __exit rxrpc_destroy_all_locals(void); +extern const struct rxrpc_security rxrpc_no_security; /* - * ar-key.c + * key.c */ extern struct key_type key_type_rxrpc; extern struct key_type key_type_rxrpc_s; @@ -576,80 +625,108 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, u32); /* - * ar-output.c + * local_event.c */ -extern unsigned int rxrpc_resend_timeout; - -int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); -int rxrpc_client_sendmsg(struct rxrpc_sock *, struct rxrpc_transport *, - struct msghdr *, size_t); -int rxrpc_server_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); +extern void rxrpc_process_local_events(struct rxrpc_local *); /* - * ar-peer.c + * local_object.c */ -struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *, gfp_t); -void rxrpc_put_peer(struct rxrpc_peer *); -struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *, __be32, __be16); -void __exit rxrpc_destroy_all_peers(void); +struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *); +void __rxrpc_put_local(struct rxrpc_local *); +void __exit rxrpc_destroy_all_locals(void); -/* - * ar-proc.c - */ -extern const char *const rxrpc_call_states[]; -extern const struct file_operations rxrpc_call_seq_fops; -extern const struct file_operations rxrpc_connection_seq_fops; +static inline void rxrpc_get_local(struct rxrpc_local *local) +{ + atomic_inc(&local->usage); +} + +static inline +struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) +{ + return atomic_inc_not_zero(&local->usage) ? local : NULL; +} + +static inline void rxrpc_put_local(struct rxrpc_local *local) +{ + if (local && atomic_dec_and_test(&local->usage)) + __rxrpc_put_local(local); +} + +static inline void rxrpc_queue_local(struct rxrpc_local *local) +{ + rxrpc_queue_work(&local->processor); +} /* - * ar-recvmsg.c + * misc.c */ -void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *); -int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); +extern unsigned int rxrpc_max_backlog __read_mostly; +extern unsigned int rxrpc_requested_ack_delay; +extern unsigned int rxrpc_soft_ack_delay; +extern unsigned int rxrpc_idle_ack_delay; +extern unsigned int rxrpc_rx_window_size; +extern unsigned int rxrpc_rx_mtu; +extern unsigned int rxrpc_rx_jumbo_max; + +extern const char *const rxrpc_pkts[]; +extern const s8 rxrpc_ack_priority[]; + +extern const char *rxrpc_acks(u8 reason); /* - * ar-security.c + * output.c */ -int __init rxrpc_init_security(void); -void rxrpc_exit_security(void); -int rxrpc_init_client_conn_security(struct rxrpc_connection *); -int rxrpc_init_server_conn_security(struct rxrpc_connection *); +extern unsigned int rxrpc_resend_timeout; + +int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); +int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* - * ar-skbuff.c + * peer_event.c */ -void rxrpc_packet_destructor(struct sk_buff *); +void rxrpc_error_report(struct sock *); +void rxrpc_peer_error_distributor(struct work_struct *); /* - * ar-transport.c + * peer_object.c */ -extern unsigned int rxrpc_transport_expiry; +struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, + const struct sockaddr_rxrpc *); +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, + struct sockaddr_rxrpc *, gfp_t); +struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); -struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, - struct rxrpc_peer *, gfp_t); -void rxrpc_put_transport(struct rxrpc_transport *); -void __exit rxrpc_destroy_all_transports(void); -struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, - struct rxrpc_peer *); +static inline void rxrpc_get_peer(struct rxrpc_peer *peer) +{ + atomic_inc(&peer->usage); +} + +static inline +struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) +{ + return atomic_inc_not_zero(&peer->usage) ? peer : NULL; +} + +extern void __rxrpc_put_peer(struct rxrpc_peer *peer); +static inline void rxrpc_put_peer(struct rxrpc_peer *peer) +{ + if (peer && atomic_dec_and_test(&peer->usage)) + __rxrpc_put_peer(peer); +} /* - * insecure.c + * proc.c */ -extern const struct rxrpc_security rxrpc_no_security; +extern const char *const rxrpc_call_states[]; +extern const struct file_operations rxrpc_call_seq_fops; +extern const struct file_operations rxrpc_connection_seq_fops; /* - * misc.c + * recvmsg.c */ -extern unsigned int rxrpc_requested_ack_delay; -extern unsigned int rxrpc_soft_ack_delay; -extern unsigned int rxrpc_idle_ack_delay; -extern unsigned int rxrpc_rx_window_size; -extern unsigned int rxrpc_rx_mtu; -extern unsigned int rxrpc_rx_jumbo_max; - -extern const char *const rxrpc_pkts[]; -extern const s8 rxrpc_ack_priority[]; - -extern const char *rxrpc_acks(u8 reason); +void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *); +int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* * rxkad.c @@ -658,6 +735,19 @@ extern const char *rxrpc_acks(u8 reason); extern const struct rxrpc_security rxkad; #endif +/* + * security.c + */ +int __init rxrpc_init_security(void); +void rxrpc_exit_security(void); +int rxrpc_init_client_conn_security(struct rxrpc_connection *); +int rxrpc_init_server_conn_security(struct rxrpc_connection *); + +/* + * skbuff.c + */ +void rxrpc_packet_destructor(struct sk_buff *); + /* * sysctl.c */ @@ -669,6 +759,11 @@ static inline int __init rxrpc_sysctl_init(void) { return 0; } static inline void rxrpc_sysctl_exit(void) {} #endif +/* + * utils.c + */ +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); + /* * debug tracing */ @@ -744,21 +839,18 @@ do { \ #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTCMP(X, OP, Y) \ do { \ - if (unlikely(!((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "RxRPC: Assertion failed\n"); \ - printk(KERN_ERR "%lu " #OP " %lu is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ + unsigned long _x = (unsigned long)(X); \ + unsigned long _y = (unsigned long)(Y); \ + if (unlikely(!(_x OP _y))) { \ + pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ + _x, _x, #OP, _y, _y); \ BUG(); \ } \ } while (0) @@ -766,21 +858,18 @@ do { \ #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ - if (unlikely((C) && !((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "RxRPC: Assertion failed\n"); \ - printk(KERN_ERR "%lu " #OP " %lu is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ + unsigned long _x = (unsigned long)(X); \ + unsigned long _y = (unsigned long)(Y); \ + if (unlikely((C) && !(_x OP _y))) { \ + pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ + _x, _x, #OP, _y, _y); \ BUG(); \ } \ } while (0) @@ -844,15 +933,6 @@ static inline void rxrpc_purge_queue(struct sk_buff_head *list) rxrpc_free_skb(skb); } -static inline void __rxrpc_get_local(struct rxrpc_local *local, const char *f) -{ - CHECK_SLAB_OKAY(&local->usage); - if (atomic_inc_return(&local->usage) == 1) - printk("resurrected (%s)\n", f); -} - -#define rxrpc_get_local(LOCAL) __rxrpc_get_local((LOCAL), __func__) - #define rxrpc_get_call(CALL) \ do { \ CHECK_SLAB_OKAY(&(CALL)->usage); \ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c new file mode 100644 index 000000000..9bae21e66 --- /dev/null +++ b/net/rxrpc/call_accept.c @@ -0,0 +1,477 @@ +/* incoming call handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * generate a connection-level abort + */ +static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, + struct rxrpc_wire_header *whdr) +{ + struct msghdr msg; + struct kvec iov[1]; + size_t len; + int ret; + + _enter("%d,,", local->debug_id); + + whdr->type = RXRPC_PACKET_TYPE_BUSY; + whdr->serial = htonl(1); + + msg.msg_name = &srx->transport.sin; + msg.msg_namelen = sizeof(srx->transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + iov[0].iov_base = whdr; + iov[0].iov_len = sizeof(*whdr); + + len = iov[0].iov_len; + + _proto("Tx BUSY %%1"); + + ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); + if (ret < 0) { + _leave(" = -EAGAIN [sendmsg failed: %d]", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * accept an incoming call that needs peer, transport and/or connection setting + * up + */ +static int rxrpc_accept_incoming_call(struct rxrpc_local *local, + struct rxrpc_sock *rx, + struct sk_buff *skb, + struct sockaddr_rxrpc *srx) +{ + struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp, *nsp; + struct rxrpc_call *call; + struct sk_buff *notification; + int ret; + + _enter(""); + + sp = rxrpc_skb(skb); + + /* get a notification message to send to the server app */ + notification = alloc_skb(0, GFP_NOFS); + if (!notification) { + _debug("no memory"); + ret = -ENOMEM; + goto error_nofree; + } + rxrpc_new_skb(notification); + notification->mark = RXRPC_SKB_MARK_NEW_CALL; + + conn = rxrpc_incoming_connection(local, srx, skb); + if (IS_ERR(conn)) { + _debug("no conn"); + ret = PTR_ERR(conn); + goto error; + } + + call = rxrpc_incoming_call(rx, conn, skb); + rxrpc_put_connection(conn); + if (IS_ERR(call)) { + _debug("no call"); + ret = PTR_ERR(call); + goto error; + } + + /* attach the call to the socket */ + read_lock_bh(&local->services_lock); + if (rx->sk.sk_state == RXRPC_CLOSE) + goto invalid_service; + + write_lock(&rx->call_lock); + if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { + rxrpc_get_call(call); + + spin_lock(&call->conn->state_lock); + if (sp->hdr.securityIndex > 0 && + call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) { + _debug("await conn sec"); + list_add_tail(&call->accept_link, &rx->secureq); + call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING; + set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); + rxrpc_queue_conn(call->conn); + } else { + _debug("conn ready"); + call->state = RXRPC_CALL_SERVER_ACCEPTING; + list_add_tail(&call->accept_link, &rx->acceptq); + rxrpc_get_call(call); + atomic_inc(&call->skb_count); + nsp = rxrpc_skb(notification); + nsp->call = call; + + ASSERTCMP(atomic_read(&call->usage), >=, 3); + + _debug("notify"); + spin_lock(&call->lock); + ret = rxrpc_queue_rcv_skb(call, notification, true, + false); + spin_unlock(&call->lock); + notification = NULL; + BUG_ON(ret < 0); + } + spin_unlock(&call->conn->state_lock); + + _debug("queued"); + } + write_unlock(&rx->call_lock); + + _debug("process"); + rxrpc_fast_process_packet(call, skb); + + _debug("done"); + read_unlock_bh(&local->services_lock); + rxrpc_free_skb(notification); + rxrpc_put_call(call); + _leave(" = 0"); + return 0; + +invalid_service: + _debug("invalid"); + read_unlock_bh(&local->services_lock); + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { + rxrpc_get_call(call); + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); + rxrpc_put_call(call); + ret = -ECONNREFUSED; +error: + rxrpc_free_skb(notification); +error_nofree: + _leave(" = %d", ret); + return ret; +} + +/* + * accept incoming calls that need peer, transport and/or connection setting up + * - the packets we get are all incoming client DATA packets that have seq == 1 + */ +void rxrpc_accept_incoming_calls(struct rxrpc_local *local) +{ + struct rxrpc_skb_priv *sp; + struct sockaddr_rxrpc srx; + struct rxrpc_sock *rx; + struct rxrpc_wire_header whdr; + struct sk_buff *skb; + int ret; + + _enter("%d", local->debug_id); + + skb = skb_dequeue(&local->accept_queue); + if (!skb) { + _leave("\n"); + return; + } + + _net("incoming call skb %p", skb); + + sp = rxrpc_skb(skb); + + /* Set up a response packet header in case we need it */ + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = 0; + whdr.flags = 0; + whdr.type = 0; + whdr.userStatus = 0; + whdr.securityIndex = sp->hdr.securityIndex; + whdr._rsvd = 0; + whdr.serviceId = htons(sp->hdr.serviceId); + + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) + goto drop; + + /* get the socket providing the service */ + read_lock_bh(&local->services_lock); + list_for_each_entry(rx, &local->services, listen_link) { + if (rx->srx.srx_service == sp->hdr.serviceId && + rx->sk.sk_state != RXRPC_CLOSE) + goto found_service; + } + read_unlock_bh(&local->services_lock); + goto invalid_service; + +found_service: + _debug("found service %hd", rx->srx.srx_service); + if (sk_acceptq_is_full(&rx->sk)) + goto backlog_full; + sk_acceptq_added(&rx->sk); + sock_hold(&rx->sk); + read_unlock_bh(&local->services_lock); + + ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); + if (ret < 0) + sk_acceptq_removed(&rx->sk); + sock_put(&rx->sk); + switch (ret) { + case -ECONNRESET: /* old calls are ignored */ + case -ECONNABORTED: /* aborted calls are reaborted or ignored */ + case 0: + return; + case -ECONNREFUSED: + goto invalid_service; + case -EBUSY: + goto busy; + case -EKEYREJECTED: + goto security_mismatch; + default: + BUG(); + } + +backlog_full: + read_unlock_bh(&local->services_lock); +busy: + rxrpc_busy(local, &srx, &whdr); + rxrpc_free_skb(skb); + return; + +drop: + rxrpc_free_skb(skb); + return; + +invalid_service: + skb->priority = RX_INVALID_OPERATION; + rxrpc_reject_packet(local, skb); + return; + + /* can't change connection security type mid-flow */ +security_mismatch: + skb->priority = RX_PROTOCOL_ERROR; + rxrpc_reject_packet(local, skb); + return; +} + +/* + * handle acceptance of a call by userspace + * - assign the user call ID to the call at the front of the queue + */ +struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + struct rb_node *parent, **pp; + int ret; + + _enter(",%lx", user_call_ID); + + ASSERT(!irqs_disabled()); + + write_lock(&rx->call_lock); + + ret = -ENODATA; + if (list_empty(&rx->acceptq)) + goto out; + + /* check the user ID isn't already in use */ + ret = -EBADSLT; + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + call = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + goto out; + } + + /* dequeue the first call and check it's still valid */ + call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + break; + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_LOCALLY_ABORTED: + ret = -ECONNABORTED; + goto out_release; + case RXRPC_CALL_NETWORK_ERROR: + ret = call->conn->error; + goto out_release; + case RXRPC_CALL_DEAD: + ret = -ETIME; + goto out_discard; + default: + BUG(); + } + + /* formalise the acceptance */ + call->user_call_ID = user_call_ID; + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + BUG(); + if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) + BUG(); + rxrpc_queue_call(call); + + rxrpc_get_call(call); + write_unlock_bh(&call->state_lock); + write_unlock(&rx->call_lock); + _leave(" = %p{%d}", call, call->debug_id); + return call; + + /* if the call is already dying or dead, then we leave the socket's ref + * on it to be released by rxrpc_dead_call_expired() as induced by + * rxrpc_release_call() */ +out_release: + _debug("release %p", call); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + rxrpc_queue_call(call); +out_discard: + write_unlock_bh(&call->state_lock); + _debug("discard %p", call); +out: + write_unlock(&rx->call_lock); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * Handle rejection of a call by userspace + * - reject the call at the front of the queue + */ +int rxrpc_reject_call(struct rxrpc_sock *rx) +{ + struct rxrpc_call *call; + int ret; + + _enter(""); + + ASSERT(!irqs_disabled()); + + write_lock(&rx->call_lock); + + ret = -ENODATA; + if (list_empty(&rx->acceptq)) + goto out; + + /* dequeue the first call and check it's still valid */ + call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + call->state = RXRPC_CALL_SERVER_BUSY; + if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) + rxrpc_queue_call(call); + ret = 0; + goto out_release; + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_LOCALLY_ABORTED: + ret = -ECONNABORTED; + goto out_release; + case RXRPC_CALL_NETWORK_ERROR: + ret = call->conn->error; + goto out_release; + case RXRPC_CALL_DEAD: + ret = -ETIME; + goto out_discard; + default: + BUG(); + } + + /* if the call is already dying or dead, then we leave the socket's ref + * on it to be released by rxrpc_dead_call_expired() as induced by + * rxrpc_release_call() */ +out_release: + _debug("release %p", call); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + rxrpc_queue_call(call); +out_discard: + write_unlock_bh(&call->state_lock); + _debug("discard %p", call); +out: + write_unlock(&rx->call_lock); + _leave(" = %d", ret); + return ret; +} + +/** + * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call + * @sock: The socket on which the impending call is waiting + * @user_call_ID: The tag to attach to the call + * + * Allow a kernel service to accept an incoming call, assuming the incoming + * call is still valid. + */ +struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + + _enter(",%lx", user_call_ID); + call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID); + _leave(" = %p", call); + return call; +} +EXPORT_SYMBOL(rxrpc_kernel_accept_call); + +/** + * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call + * @sock: The socket on which the impending call is waiting + * + * Allow a kernel service to reject an incoming call with a BUSY message, + * assuming the incoming call is still valid. + */ +int rxrpc_kernel_reject_call(struct socket *sock) +{ + int ret; + + _enter(""); + ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(rxrpc_kernel_reject_call); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c new file mode 100644 index 000000000..e60cf65c2 --- /dev/null +++ b/net/rxrpc/call_event.c @@ -0,0 +1,1302 @@ +/* Management of Tx window, Tx resend, ACKs and out-of-sequence reception + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * propose an ACK be sent + */ +void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u32 serial, bool immediate) +{ + unsigned long expiry; + s8 prior = rxrpc_ack_priority[ack_reason]; + + ASSERTCMP(prior, >, 0); + + _enter("{%d},%s,%%%x,%u", + call->debug_id, rxrpc_acks(ack_reason), serial, immediate); + + if (prior < rxrpc_ack_priority[call->ackr_reason]) { + if (immediate) + goto cancel_timer; + return; + } + + /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial + * numbers */ + if (prior == rxrpc_ack_priority[call->ackr_reason]) { + if (prior <= 4) + call->ackr_serial = serial; + if (immediate) + goto cancel_timer; + return; + } + + call->ackr_reason = ack_reason; + call->ackr_serial = serial; + + switch (ack_reason) { + case RXRPC_ACK_DELAY: + _debug("run delay timer"); + expiry = rxrpc_soft_ack_delay; + goto run_timer; + + case RXRPC_ACK_IDLE: + if (!immediate) { + _debug("run defer timer"); + expiry = rxrpc_idle_ack_delay; + goto run_timer; + } + goto cancel_timer; + + case RXRPC_ACK_REQUESTED: + expiry = rxrpc_requested_ack_delay; + if (!expiry) + goto cancel_timer; + if (!immediate || serial == 1) { + _debug("run defer timer"); + goto run_timer; + } + + default: + _debug("immediate ACK"); + goto cancel_timer; + } + +run_timer: + expiry += jiffies; + if (!timer_pending(&call->ack_timer) || + time_after(call->ack_timer.expires, expiry)) + mod_timer(&call->ack_timer, expiry); + return; + +cancel_timer: + _debug("cancel timer %%%u", serial); + try_to_del_timer_sync(&call->ack_timer); + read_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} + +/* + * propose an ACK be sent, locking the call structure + */ +void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u32 serial, bool immediate) +{ + s8 prior = rxrpc_ack_priority[ack_reason]; + + if (prior > rxrpc_ack_priority[call->ackr_reason]) { + spin_lock_bh(&call->lock); + __rxrpc_propose_ACK(call, ack_reason, serial, immediate); + spin_unlock_bh(&call->lock); + } +} + +/* + * set the resend timer + */ +static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, + unsigned long resend_at) +{ + read_lock_bh(&call->state_lock); + if (call->state >= RXRPC_CALL_COMPLETE) + resend = 0; + + if (resend & 1) { + _debug("SET RESEND"); + set_bit(RXRPC_CALL_EV_RESEND, &call->events); + } + + if (resend & 2) { + _debug("MODIFY RESEND TIMER"); + set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + mod_timer(&call->resend_timer, resend_at); + } else { + _debug("KILL RESEND TIMER"); + del_timer_sync(&call->resend_timer); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + read_unlock_bh(&call->state_lock); +} + +/* + * resend packets + */ +static void rxrpc_resend(struct rxrpc_call *call) +{ + struct rxrpc_wire_header *whdr; + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + bool stop; + int loop; + u8 resend; + + _enter("{%d,%d,%d,%d},", + call->acks_hard, call->acks_unacked, + atomic_read(&call->sequence), + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); + + stop = false; + resend = 0; + resend_at = 0; + + for (loop = call->acks_tail; + loop != call->acks_head || stop; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + if (*p_txb & 1) + continue; + + txb = (struct sk_buff *) *p_txb; + sp = rxrpc_skb(txb); + + if (sp->need_resend) { + sp->need_resend = false; + + /* each Tx packet has a new serial number */ + sp->hdr.serial = atomic_inc_return(&call->conn->serial); + + whdr = (struct rxrpc_wire_header *)txb->head; + whdr->serial = htonl(sp->hdr.serial); + + _proto("Tx DATA %%%u { #%d }", + sp->hdr.serial, sp->hdr.seq); + if (rxrpc_send_data_packet(call->conn, txb) < 0) { + stop = true; + sp->resend_at = jiffies + 3; + } else { + sp->resend_at = + jiffies + rxrpc_resend_timeout; + } + } + + if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = true; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(""); +} + +/* + * handle resend timer expiry + */ +static void rxrpc_resend_timer(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop; + u8 resend; + + _enter("%d,%d,%d", + call->acks_tail, call->acks_unacked, call->acks_head); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + resend = 0; + resend_at = 0; + + for (loop = call->acks_unacked; + loop != call->acks_head; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + ASSERT(!(*p_txb & 1)); + + if (sp->need_resend) { + ; + } else if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = true; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(""); +} + +/* + * process soft ACKs of our transmitted packets + * - these indicate packets the peer has or has not received, but hasn't yet + * given to the consumer, and so can still be discarded and re-requested + */ +static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, + struct rxrpc_ackpacket *ack, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop; + u8 sacks[RXRPC_MAXACKS], resend; + + _enter("{%d,%d},{%d},", + call->acks_hard, + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz), + ack->nAcks); + + if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0) + goto protocol_error; + + resend = 0; + resend_at = 0; + for (loop = 0; loop < ack->nAcks; loop++) { + p_txb = call->acks_window; + p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1); + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + switch (sacks[loop]) { + case RXRPC_ACK_TYPE_ACK: + sp->need_resend = false; + *p_txb |= 1; + break; + case RXRPC_ACK_TYPE_NACK: + sp->need_resend = true; + *p_txb &= ~1; + resend = 1; + break; + default: + _debug("Unsupported ACK type %d", sacks[loop]); + goto protocol_error; + } + } + + smp_mb(); + call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1); + + /* anything not explicitly ACK'd is implicitly NACK'd, but may just not + * have been received or processed yet by the far end */ + for (loop = call->acks_unacked; + loop != call->acks_head; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + if (*p_txb & 1) { + /* packet must have been discarded */ + sp->need_resend = true; + *p_txb &= ~1; + resend |= 1; + } else if (sp->need_resend) { + ; + } else if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = true; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(" = 0"); + return 0; + +protocol_error: + _leave(" = -EPROTO"); + return -EPROTO; +} + +/* + * discard hard-ACK'd packets from the Tx window + */ +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) +{ + unsigned long _skb; + int tail = call->acks_tail, old_tail; + int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); + + _enter("{%u,%u},%u", call->acks_hard, win, hard); + + ASSERTCMP(hard - call->acks_hard, <=, win); + + while (call->acks_hard < hard) { + smp_read_barrier_depends(); + _skb = call->acks_window[tail] & ~1; + rxrpc_free_skb((struct sk_buff *) _skb); + old_tail = tail; + tail = (tail + 1) & (call->acks_winsz - 1); + call->acks_tail = tail; + if (call->acks_unacked == old_tail) + call->acks_unacked = tail; + call->acks_hard++; + } + + wake_up(&call->tx_waitq); +} + +/* + * clear the Tx window in the event of a failure + */ +static void rxrpc_clear_tx_window(struct rxrpc_call *call) +{ + rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); +} + +/* + * drain the out of sequence received packet queue into the packet Rx queue + */ +static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + bool terminal; + int ret; + + _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos); + + spin_lock_bh(&call->lock); + + ret = -ECONNRESET; + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) + goto socket_unavailable; + + skb = skb_dequeue(&call->rx_oos_queue); + if (skb) { + sp = rxrpc_skb(skb); + + _debug("drain OOS packet %d [%d]", + sp->hdr.seq, call->rx_first_oos); + + if (sp->hdr.seq != call->rx_first_oos) { + skb_queue_head(&call->rx_oos_queue, skb); + call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; + _debug("requeue %p {%u}", skb, call->rx_first_oos); + } else { + skb->mark = RXRPC_SKB_MARK_DATA; + terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && + !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); + ret = rxrpc_queue_rcv_skb(call, skb, true, terminal); + BUG_ON(ret < 0); + _debug("drain #%u", call->rx_data_post); + call->rx_data_post++; + + /* find out what the next packet is */ + skb = skb_peek(&call->rx_oos_queue); + if (skb) + call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; + else + call->rx_first_oos = 0; + _debug("peek %p {%u}", skb, call->rx_first_oos); + } + } + + ret = 0; +socket_unavailable: + spin_unlock_bh(&call->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * insert an out of sequence packet into the buffer + */ +static void rxrpc_insert_oos_packet(struct rxrpc_call *call, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp, *psp; + struct sk_buff *p; + u32 seq; + + sp = rxrpc_skb(skb); + seq = sp->hdr.seq; + _enter(",,{%u}", seq); + + skb->destructor = rxrpc_packet_destructor; + ASSERTCMP(sp->call, ==, NULL); + sp->call = call; + rxrpc_get_call(call); + atomic_inc(&call->skb_count); + + /* insert into the buffer in sequence order */ + spin_lock_bh(&call->lock); + + skb_queue_walk(&call->rx_oos_queue, p) { + psp = rxrpc_skb(p); + if (psp->hdr.seq > seq) { + _debug("insert oos #%u before #%u", seq, psp->hdr.seq); + skb_insert(p, skb, &call->rx_oos_queue); + goto inserted; + } + } + + _debug("append oos #%u", seq); + skb_queue_tail(&call->rx_oos_queue, skb); +inserted: + + /* we might now have a new front to the queue */ + if (call->rx_first_oos == 0 || seq < call->rx_first_oos) + call->rx_first_oos = seq; + + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + call->rx_data_post == call->rx_first_oos) { + _debug("drain rx oos now"); + set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); + } + read_unlock(&call->state_lock); + + spin_unlock_bh(&call->lock); + _leave(" [stored #%u]", call->rx_first_oos); +} + +/* + * clear the Tx window on final ACK reception + */ +static void rxrpc_zap_tx_window(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + unsigned long _skb, *acks_window; + u8 winsz = call->acks_winsz; + int tail; + + acks_window = call->acks_window; + call->acks_window = NULL; + + while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) { + tail = call->acks_tail; + smp_read_barrier_depends(); + _skb = acks_window[tail] & ~1; + smp_mb(); + call->acks_tail = (call->acks_tail + 1) & (winsz - 1); + + skb = (struct sk_buff *) _skb; + sp = rxrpc_skb(skb); + _debug("+++ clear Tx %u", sp->hdr.seq); + rxrpc_free_skb(skb); + } + + kfree(acks_window); +} + +/* + * process the extra information that may be appended to an ACK packet + */ +static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int latest, int nAcks) +{ + struct rxrpc_ackinfo ackinfo; + struct rxrpc_peer *peer; + unsigned int mtu; + + if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) { + _leave(" [no ackinfo]"); + return; + } + + _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", + latest, + ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU), + ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max)); + + mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); + + peer = call->conn->params.peer; + if (mtu < peer->maxdata) { + spin_lock_bh(&peer->lock); + peer->maxdata = mtu; + peer->mtu = mtu + peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); + } +} + +/* + * process packets in the reception queue + */ +static int rxrpc_process_rx_queue(struct rxrpc_call *call, + u32 *_abort_code) +{ + struct rxrpc_ackpacket ack; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + bool post_ACK; + int latest; + u32 hard, tx; + + _enter(""); + +process_further: + skb = skb_dequeue(&call->rx_queue); + if (!skb) + return -EAGAIN; + + _net("deferred skb %p", skb); + + sp = rxrpc_skb(skb); + + _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state); + + post_ACK = false; + + switch (sp->hdr.type) { + /* data packets that wind up here have been received out of + * order, need security processing or are jumbo packets */ + case RXRPC_PACKET_TYPE_DATA: + _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); + + /* secured packets must be verified and possibly decrypted */ + if (call->conn->security->verify_packet(call, skb, + _abort_code) < 0) + goto protocol_error; + + rxrpc_insert_oos_packet(call, skb); + goto process_further; + + /* partial ACK to process */ + case RXRPC_PACKET_TYPE_ACK: + if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) { + _debug("extraction failure"); + goto protocol_error; + } + if (!skb_pull(skb, sizeof(ack))) + BUG(); + + latest = sp->hdr.serial; + hard = ntohl(ack.firstPacket); + tx = atomic_read(&call->sequence); + + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + latest, + ntohs(ack.maxSkew), + hard, + ntohl(ack.previousPacket), + ntohl(ack.serial), + rxrpc_acks(ack.reason), + ack.nAcks); + + rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); + + if (ack.reason == RXRPC_ACK_PING) { + _proto("Rx ACK %%%u PING Request", latest); + rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, + sp->hdr.serial, true); + } + + /* discard any out-of-order or duplicate ACKs */ + if (latest - call->acks_latest <= 0) { + _debug("discard ACK %d <= %d", + latest, call->acks_latest); + goto discard; + } + call->acks_latest = latest; + + if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY && + call->state != RXRPC_CALL_SERVER_SEND_REPLY && + call->state != RXRPC_CALL_SERVER_AWAIT_ACK) + goto discard; + + _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state); + + if (hard > 0) { + if (hard - 1 > tx) { + _debug("hard-ACK'd packet %d not transmitted" + " (%d top)", + hard - 1, tx); + goto protocol_error; + } + + if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || + call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && + hard > tx) { + call->acks_hard = tx; + goto all_acked; + } + + smp_rmb(); + rxrpc_rotate_tx_window(call, hard - 1); + } + + if (ack.nAcks > 0) { + if (hard - 1 + ack.nAcks > tx) { + _debug("soft-ACK'd packet %d+%d not" + " transmitted (%d top)", + hard - 1, ack.nAcks, tx); + goto protocol_error; + } + + if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) + goto protocol_error; + } + goto discard; + + /* complete ACK to process */ + case RXRPC_PACKET_TYPE_ACKALL: + goto all_acked; + + /* abort and busy are handled elsewhere */ + case RXRPC_PACKET_TYPE_BUSY: + case RXRPC_PACKET_TYPE_ABORT: + BUG(); + + /* connection level events - also handled elsewhere */ + case RXRPC_PACKET_TYPE_CHALLENGE: + case RXRPC_PACKET_TYPE_RESPONSE: + case RXRPC_PACKET_TYPE_DEBUG: + BUG(); + } + + /* if we've had a hard ACK that covers all the packets we've sent, then + * that ends that phase of the operation */ +all_acked: + write_lock_bh(&call->state_lock); + _debug("ack all %d", call->state); + + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + break; + case RXRPC_CALL_SERVER_AWAIT_ACK: + _debug("srv complete"); + call->state = RXRPC_CALL_COMPLETE; + post_ACK = true; + break; + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_SERVER_RECV_REQUEST: + goto protocol_error_unlock; /* can't occur yet */ + default: + write_unlock_bh(&call->state_lock); + goto discard; /* assume packet left over from earlier phase */ + } + + write_unlock_bh(&call->state_lock); + + /* if all the packets we sent are hard-ACK'd, then we can discard + * whatever we've got left */ + _debug("clear Tx %d", + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); + + del_timer_sync(&call->resend_timer); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + + if (call->acks_window) + rxrpc_zap_tx_window(call); + + if (post_ACK) { + /* post the final ACK message for userspace to pick up */ + _debug("post ACK"); + skb->mark = RXRPC_SKB_MARK_FINAL_ACK; + sp->call = call; + rxrpc_get_call(call); + atomic_inc(&call->skb_count); + spin_lock_bh(&call->lock); + if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) + BUG(); + spin_unlock_bh(&call->lock); + goto process_further; + } + +discard: + rxrpc_free_skb(skb); + goto process_further; + +protocol_error_unlock: + write_unlock_bh(&call->state_lock); +protocol_error: + rxrpc_free_skb(skb); + _leave(" = -EPROTO"); + return -EPROTO; +} + +/* + * post a message to the socket Rx queue for recvmsg() to pick up + */ +static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, + bool fatal) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + _enter("{%d,%lx},%u,%u,%d", + call->debug_id, call->flags, mark, error, fatal); + + /* remove timers and things for fatal messages */ + if (fatal) { + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + + if (mark != RXRPC_SKB_MARK_NEW_CALL && + !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + _leave("[no userid]"); + return 0; + } + + if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { + skb = alloc_skb(0, GFP_NOFS); + if (!skb) + return -ENOMEM; + + rxrpc_new_skb(skb); + + skb->mark = mark; + + sp = rxrpc_skb(skb); + memset(sp, 0, sizeof(*sp)); + sp->error = error; + sp->call = call; + rxrpc_get_call(call); + atomic_inc(&call->skb_count); + + spin_lock_bh(&call->lock); + ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); + spin_unlock_bh(&call->lock); + BUG_ON(ret < 0); + } + + return 0; +} + +/* + * handle background processing of incoming call packets and ACK / abort + * generation + */ +void rxrpc_process_call(struct work_struct *work) +{ + struct rxrpc_call *call = + container_of(work, struct rxrpc_call, processor); + struct rxrpc_wire_header whdr; + struct rxrpc_ackpacket ack; + struct rxrpc_ackinfo ackinfo; + struct msghdr msg; + struct kvec iov[5]; + enum rxrpc_call_event genbit; + unsigned long bits; + __be32 data, pad; + size_t len; + int loop, nbit, ioc, ret, mtu; + u32 serial, abort_code = RX_PROTOCOL_ERROR; + u8 *acks = NULL; + + //printk("\n--------------------\n"); + _enter("{%d,%s,%lx} [%lu]", + call->debug_id, rxrpc_call_states[call->state], call->events, + (jiffies - call->creation_jif) / (HZ / 10)); + + if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) { + _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX"); + return; + } + + if (!call->conn) + goto skip_msg_init; + + /* there's a good chance we're going to have to send a message, so set + * one up in advance */ + msg.msg_name = &call->conn->params.peer->srx.transport; + msg.msg_namelen = call->conn->params.peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(call->conn->proto.epoch); + whdr.cid = htonl(call->cid); + whdr.callNumber = htonl(call->call_id); + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ACK; + whdr.flags = call->conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = call->conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(call->service_id); + + memset(iov, 0, sizeof(iov)); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); +skip_msg_init: + + /* deal with events of a final nature */ + if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { + enum rxrpc_skb_mark mark; + int error; + + clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_ABORT, &call->events); + + error = call->error_report; + if (error < RXRPC_LOCAL_ERROR_OFFSET) { + mark = RXRPC_SKB_MARK_NET_ERROR; + _debug("post net error %d", error); + } else { + mark = RXRPC_SKB_MARK_LOCAL_ERROR; + error -= RXRPC_LOCAL_ERROR_OFFSET; + _debug("post net local error %d", error); + } + + if (rxrpc_post_message(call, mark, error, true) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); + goto kill_ACKs; + } + + if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) { + ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); + + clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_ABORT, &call->events); + + _debug("post conn abort"); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + call->conn->error, true) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); + goto kill_ACKs; + } + + if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { + whdr.type = RXRPC_PACKET_TYPE_BUSY; + genbit = RXRPC_CALL_EV_REJECT_BUSY; + goto send_message; + } + + if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) { + ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + ECONNABORTED, true) < 0) + goto no_mem; + whdr.type = RXRPC_PACKET_TYPE_ABORT; + data = htonl(call->local_abort); + iov[1].iov_base = &data; + iov[1].iov_len = sizeof(data); + genbit = RXRPC_CALL_EV_ABORT; + goto send_message; + } + + if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) { + genbit = RXRPC_CALL_EV_ACK_FINAL; + + ack.bufferSpace = htons(8); + ack.maxSkew = 0; + ack.serial = 0; + ack.reason = RXRPC_ACK_IDLE; + ack.nAcks = 0; + call->ackr_reason = 0; + + spin_lock_bh(&call->lock); + ack.serial = htonl(call->ackr_serial); + ack.previousPacket = htonl(call->ackr_prev_seq); + ack.firstPacket = htonl(call->rx_data_eaten + 1); + spin_unlock_bh(&call->lock); + + pad = 0; + + iov[1].iov_base = &ack; + iov[1].iov_len = sizeof(ack); + iov[2].iov_base = &pad; + iov[2].iov_len = 3; + iov[3].iov_base = &ackinfo; + iov[3].iov_len = sizeof(ackinfo); + goto send_ACK; + } + + if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) | + (1 << RXRPC_CALL_EV_RCVD_ABORT)) + ) { + u32 mark; + + if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events)) + mark = RXRPC_SKB_MARK_REMOTE_ABORT; + else + mark = RXRPC_SKB_MARK_BUSY; + + _debug("post abort/busy"); + rxrpc_clear_tx_window(call); + if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) + goto no_mem; + + clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); + goto kill_ACKs; + } + + if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) { + _debug("do implicit ackall"); + rxrpc_clear_tx_window(call); + } + + if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { + write_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_CALL_TIMEOUT; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + } + write_unlock_bh(&call->state_lock); + + _debug("post timeout"); + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + ETIME, true) < 0) + goto no_mem; + + clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); + goto kill_ACKs; + } + + /* deal with assorted inbound messages */ + if (!skb_queue_empty(&call->rx_queue)) { + switch (rxrpc_process_rx_queue(call, &abort_code)) { + case 0: + case -EAGAIN: + break; + case -ENOMEM: + goto no_mem; + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EPROTO: + rxrpc_abort_call(call, abort_code); + goto kill_ACKs; + } + } + + /* handle resending */ + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) + rxrpc_resend_timer(call); + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_resend(call); + + /* consider sending an ordinary ACK */ + if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { + _debug("send ACK: window: %d - %d { %lx }", + call->rx_data_eaten, call->ackr_win_top, + call->ackr_window[0]); + + if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && + call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { + /* ACK by sending reply DATA packet in this state */ + clear_bit(RXRPC_CALL_EV_ACK, &call->events); + goto maybe_reschedule; + } + + genbit = RXRPC_CALL_EV_ACK; + + acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, + GFP_NOFS); + if (!acks) + goto no_mem; + + //hdr.flags = RXRPC_SLOW_START_OK; + ack.bufferSpace = htons(8); + ack.maxSkew = 0; + + spin_lock_bh(&call->lock); + ack.reason = call->ackr_reason; + ack.serial = htonl(call->ackr_serial); + ack.previousPacket = htonl(call->ackr_prev_seq); + ack.firstPacket = htonl(call->rx_data_eaten + 1); + + ack.nAcks = 0; + for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { + nbit = loop * BITS_PER_LONG; + for (bits = call->ackr_window[loop]; bits; bits >>= 1 + ) { + _debug("- l=%d n=%d b=%lx", loop, nbit, bits); + if (bits & 1) { + acks[nbit] = RXRPC_ACK_TYPE_ACK; + ack.nAcks = nbit + 1; + } + nbit++; + } + } + call->ackr_reason = 0; + spin_unlock_bh(&call->lock); + + pad = 0; + + iov[1].iov_base = &ack; + iov[1].iov_len = sizeof(ack); + iov[2].iov_base = acks; + iov[2].iov_len = ack.nAcks; + iov[3].iov_base = &pad; + iov[3].iov_len = 3; + iov[4].iov_base = &ackinfo; + iov[4].iov_len = sizeof(ackinfo); + + switch (ack.reason) { + case RXRPC_ACK_REQUESTED: + case RXRPC_ACK_DUPLICATE: + case RXRPC_ACK_OUT_OF_SEQUENCE: + case RXRPC_ACK_EXCEEDS_WINDOW: + case RXRPC_ACK_NOSPACE: + case RXRPC_ACK_PING: + case RXRPC_ACK_PING_RESPONSE: + goto send_ACK_with_skew; + case RXRPC_ACK_DELAY: + case RXRPC_ACK_IDLE: + goto send_ACK; + } + } + + /* handle completion of security negotiations on an incoming + * connection */ + if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { + _debug("secured"); + spin_lock_bh(&call->lock); + + if (call->state == RXRPC_CALL_SERVER_SECURING) { + _debug("securing"); + write_lock(&call->socket->call_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { + _debug("not released"); + call->state = RXRPC_CALL_SERVER_ACCEPTING; + list_move_tail(&call->accept_link, + &call->socket->acceptq); + } + write_unlock(&call->socket->call_lock); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); + read_unlock(&call->state_lock); + } + + spin_unlock_bh(&call->lock); + if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) + goto maybe_reschedule; + } + + /* post a notification of an acceptable connection to the app */ + if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { + _debug("post accept"); + if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, + 0, false) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); + goto maybe_reschedule; + } + + /* handle incoming call acceptance */ + if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { + _debug("accepted"); + ASSERTCMP(call->rx_data_post, ==, 0); + call->rx_data_post = 1; + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); + read_unlock_bh(&call->state_lock); + } + + /* drain the out of sequence received packet queue into the packet Rx + * queue */ + if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) { + while (call->rx_data_post == call->rx_first_oos) + if (rxrpc_drain_rx_oos_queue(call) < 0) + break; + goto maybe_reschedule; + } + + if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { + rxrpc_release_call(call); + clear_bit(RXRPC_CALL_EV_RELEASE, &call->events); + } + + /* other events may have been raised since we started checking */ + goto maybe_reschedule; + +send_ACK_with_skew: + ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) - + ntohl(ack.serial)); +send_ACK: + mtu = call->conn->params.peer->if_mtu; + mtu -= call->conn->params.peer->hdrsize; + ackinfo.maxMTU = htonl(mtu); + ackinfo.rwind = htonl(rxrpc_rx_window_size); + + /* permit the peer to send us jumbo packets if it wants to */ + ackinfo.rxMTU = htonl(rxrpc_rx_mtu); + ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); + + serial = atomic_inc_return(&call->conn->serial); + whdr.serial = htonl(serial); + _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + serial, + ntohs(ack.maxSkew), + ntohl(ack.firstPacket), + ntohl(ack.previousPacket), + ntohl(ack.serial), + rxrpc_acks(ack.reason), + ack.nAcks); + + del_timer_sync(&call->ack_timer); + if (ack.nAcks > 0) + set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags); + goto send_message_2; + +send_message: + _debug("send message"); + + serial = atomic_inc_return(&call->conn->serial); + whdr.serial = htonl(serial); + _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial); +send_message_2: + + len = iov[0].iov_len; + ioc = 1; + if (iov[4].iov_len) { + ioc = 5; + len += iov[4].iov_len; + len += iov[3].iov_len; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[3].iov_len) { + ioc = 4; + len += iov[3].iov_len; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[2].iov_len) { + ioc = 3; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[1].iov_len) { + ioc = 2; + len += iov[1].iov_len; + } + + ret = kernel_sendmsg(call->conn->params.local->socket, + &msg, iov, ioc, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + goto error; + } + + switch (genbit) { + case RXRPC_CALL_EV_ABORT: + clear_bit(genbit, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); + goto kill_ACKs; + + case RXRPC_CALL_EV_ACK_FINAL: + write_lock_bh(&call->state_lock); + if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) + call->state = RXRPC_CALL_COMPLETE; + write_unlock_bh(&call->state_lock); + goto kill_ACKs; + + default: + clear_bit(genbit, &call->events); + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + _debug("start ACK timer"); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, + call->ackr_serial, false); + default: + break; + } + goto maybe_reschedule; + } + +kill_ACKs: + del_timer_sync(&call->ack_timer); + if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) + rxrpc_put_call(call); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); + +maybe_reschedule: + if (call->events || !skb_queue_empty(&call->rx_queue)) { + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + } + + /* don't leave aborted connections on the accept queue */ + if (call->state >= RXRPC_CALL_COMPLETE && + !list_empty(&call->accept_link)) { + _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }", + call, call->events, call->flags, call->conn->proto.cid); + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + } + +error: + clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags); + kfree(acks); + + /* because we don't want two CPUs both processing the work item for one + * call at the same time, we use a flag to note when it's busy; however + * this means there's a race between clearing the flag and setting the + * work pending bit and the work item being processed again */ + if (call->events && !work_pending(&call->processor)) { + _debug("jumpstart %x", call->conn->proto.cid); + rxrpc_queue_call(call); + } + + _leave(""); + return; + +no_mem: + _debug("out of memory"); + goto maybe_reschedule; +} diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c new file mode 100644 index 000000000..ae057e074 --- /dev/null +++ b/net/rxrpc/call_object.c @@ -0,0 +1,801 @@ +/* RxRPC individual remote procedure call handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * Maximum lifetime of a call (in jiffies). + */ +unsigned int rxrpc_max_call_lifetime = 60 * HZ; + +/* + * Time till dead call expires after last use (in jiffies). + */ +unsigned int rxrpc_dead_call_expiry = 2 * HZ; + +const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { + [RXRPC_CALL_UNINITIALISED] = "Uninit", + [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", + [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", + [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", + [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", + [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", + [RXRPC_CALL_SERVER_SECURING] = "SvSecure", + [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", + [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", + [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", + [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", + [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", + [RXRPC_CALL_COMPLETE] = "Complete", + [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", + [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", + [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CALL_NETWORK_ERROR] = "NetError", + [RXRPC_CALL_DEAD] = "Dead ", +}; + +struct kmem_cache *rxrpc_call_jar; +LIST_HEAD(rxrpc_calls); +DEFINE_RWLOCK(rxrpc_call_lock); + +static void rxrpc_destroy_call(struct work_struct *work); +static void rxrpc_call_life_expired(unsigned long _call); +static void rxrpc_dead_call_expired(unsigned long _call); +static void rxrpc_ack_time_expired(unsigned long _call); +static void rxrpc_resend_time_expired(unsigned long _call); + +/* + * find an extant server call + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("%p,%lx", rx, user_call_ID); + + read_lock(&rx->call_lock); + + p = rx->calls.rb_node; + while (p) { + call = rb_entry(p, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + p = p->rb_left; + else if (user_call_ID > call->user_call_ID) + p = p->rb_right; + else + goto found_extant_call; + } + + read_unlock(&rx->call_lock); + _leave(" = NULL"); + return NULL; + +found_extant_call: + rxrpc_get_call(call); + read_unlock(&rx->call_lock); + _leave(" = %p [%d]", call, atomic_read(&call->usage)); + return call; +} + +/* + * allocate a new call + */ +static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +{ + struct rxrpc_call *call; + + call = kmem_cache_zalloc(rxrpc_call_jar, gfp); + if (!call) + return NULL; + + call->acks_winsz = 16; + call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), + gfp); + if (!call->acks_window) { + kmem_cache_free(rxrpc_call_jar, call); + return NULL; + } + + setup_timer(&call->lifetimer, &rxrpc_call_life_expired, + (unsigned long) call); + setup_timer(&call->deadspan, &rxrpc_dead_call_expired, + (unsigned long) call); + setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, + (unsigned long) call); + setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, + (unsigned long) call); + INIT_WORK(&call->destroyer, &rxrpc_destroy_call); + INIT_WORK(&call->processor, &rxrpc_process_call); + INIT_LIST_HEAD(&call->link); + INIT_LIST_HEAD(&call->accept_link); + skb_queue_head_init(&call->rx_queue); + skb_queue_head_init(&call->rx_oos_queue); + init_waitqueue_head(&call->tx_waitq); + spin_lock_init(&call->lock); + rwlock_init(&call->state_lock); + atomic_set(&call->usage, 1); + call->debug_id = atomic_inc_return(&rxrpc_debug_id); + + memset(&call->sock_node, 0xed, sizeof(call->sock_node)); + + call->rx_data_expect = 1; + call->rx_data_eaten = 0; + call->rx_first_oos = 0; + call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; + call->creation_jif = jiffies; + return call; +} + +/* + * Allocate a new client call. + */ +static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + struct rxrpc_call *call; + + _enter(""); + + ASSERT(rx->local != NULL); + + call = rxrpc_alloc_call(gfp); + if (!call) + return ERR_PTR(-ENOMEM); + call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; + + sock_hold(&rx->sk); + call->socket = rx; + call->rx_data_post = 1; + + call->local = rx->local; + call->service_id = srx->srx_service; + call->in_clientflag = 0; + + _leave(" = %p", call); + return call; +} + +/* + * Begin client call. + */ +static int rxrpc_begin_client_call(struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + int ret; + + /* Set up or get a connection record and set the protocol parameters, + * including channel number and call ID. + */ + ret = rxrpc_connect_call(call, cp, srx, gfp); + if (ret < 0) + return ret; + + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + + spin_lock(&call->conn->params.peer->lock); + hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets); + spin_unlock(&call->conn->params.peer->lock); + + call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; + add_timer(&call->lifetimer); + return 0; +} + +/* + * set up a call for the given data + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + unsigned long user_call_ID, + gfp_t gfp) +{ + struct rxrpc_call *call, *xcall; + struct rb_node *parent, **pp; + int ret; + + _enter("%p,%lx", rx, user_call_ID); + + call = rxrpc_alloc_client_call(rx, srx, gfp); + if (IS_ERR(call)) { + _leave(" = %ld", PTR_ERR(call)); + return call; + } + + /* Publish the call, even though it is incompletely set up as yet */ + call->user_call_ID = user_call_ID; + __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); + + write_lock(&rx->call_lock); + + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + xcall = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < xcall->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > xcall->user_call_ID) + pp = &(*pp)->rb_right; + else + goto found_user_ID_now_present; + } + + rxrpc_get_call(call); + + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + write_unlock(&rx->call_lock); + + write_lock_bh(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock_bh(&rxrpc_call_lock); + + ret = rxrpc_begin_client_call(call, cp, srx, gfp); + if (ret < 0) + goto error; + + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); + + _leave(" = %p [new]", call); + return call; + +error: + write_lock(&rx->call_lock); + rb_erase(&call->sock_node, &rx->calls); + write_unlock(&rx->call_lock); + rxrpc_put_call(call); + + write_lock_bh(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock_bh(&rxrpc_call_lock); + + set_bit(RXRPC_CALL_RELEASED, &call->flags); + call->state = RXRPC_CALL_DEAD; + rxrpc_put_call(call); + _leave(" = %d", ret); + return ERR_PTR(ret); + + /* We unexpectedly found the user ID in the list after taking + * the call_lock. This shouldn't happen unless the user races + * with itself and tries to add the same user ID twice at the + * same time in different threads. + */ +found_user_ID_now_present: + write_unlock(&rx->call_lock); + set_bit(RXRPC_CALL_RELEASED, &call->flags); + call->state = RXRPC_CALL_DEAD; + rxrpc_put_call(call); + _leave(" = -EEXIST [%p]", call); + return ERR_PTR(-EEXIST); +} + +/* + * set up an incoming call + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call, *candidate; + u32 call_id, chan; + + _enter(",%d", conn->debug_id); + + ASSERT(rx != NULL); + + candidate = rxrpc_alloc_call(GFP_NOIO); + if (!candidate) + return ERR_PTR(-EBUSY); + + chan = sp->hdr.cid & RXRPC_CHANNELMASK; + candidate->socket = rx; + candidate->conn = conn; + candidate->cid = sp->hdr.cid; + candidate->call_id = sp->hdr.callNumber; + candidate->channel = chan; + candidate->rx_data_post = 0; + candidate->state = RXRPC_CALL_SERVER_ACCEPTING; + if (conn->security_ix > 0) + candidate->state = RXRPC_CALL_SERVER_SECURING; + + spin_lock(&conn->channel_lock); + + /* set the channel for this call */ + call = rcu_dereference_protected(conn->channels[chan].call, + lockdep_is_held(&conn->channel_lock)); + + _debug("channel[%u] is %p", candidate->channel, call); + if (call && call->call_id == sp->hdr.callNumber) { + /* already set; must've been a duplicate packet */ + _debug("extant call [%d]", call->state); + ASSERTCMP(call->conn, ==, conn); + + read_lock(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_LOCALLY_ABORTED: + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) + rxrpc_queue_call(call); + case RXRPC_CALL_REMOTELY_ABORTED: + read_unlock(&call->state_lock); + goto aborted_call; + default: + rxrpc_get_call(call); + read_unlock(&call->state_lock); + goto extant_call; + } + } + + if (call) { + /* it seems the channel is still in use from the previous call + * - ditch the old binding if its call is now complete */ + _debug("CALL: %u { %s }", + call->debug_id, rxrpc_call_states[call->state]); + + if (call->state >= RXRPC_CALL_COMPLETE) { + __rxrpc_disconnect_call(call); + } else { + spin_unlock(&conn->channel_lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -EBUSY"); + return ERR_PTR(-EBUSY); + } + } + + /* check the call number isn't duplicate */ + _debug("check dup"); + call_id = sp->hdr.callNumber; + + /* We just ignore calls prior to the current call ID. Terminated calls + * are handled via the connection. + */ + if (call_id <= conn->channels[chan].call_counter) + goto old_call; /* TODO: Just drop packet */ + + /* make the call available */ + _debug("new call"); + call = candidate; + candidate = NULL; + conn->channels[chan].call_counter = call_id; + rcu_assign_pointer(conn->channels[chan].call, call); + sock_hold(&rx->sk); + rxrpc_get_connection(conn); + spin_unlock(&conn->channel_lock); + + spin_lock(&conn->params.peer->lock); + hlist_add_head(&call->error_link, &conn->params.peer->error_targets); + spin_unlock(&conn->params.peer->lock); + + write_lock_bh(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock_bh(&rxrpc_call_lock); + + call->local = conn->params.local; + call->epoch = conn->proto.epoch; + call->service_id = conn->params.service_id; + call->in_clientflag = RXRPC_CLIENT_INITIATED; + + _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); + + call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; + add_timer(&call->lifetimer); + _leave(" = %p {%d} [new]", call, call->debug_id); + return call; + +extant_call: + spin_unlock(&conn->channel_lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1); + return call; + +aborted_call: + spin_unlock(&conn->channel_lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -ECONNABORTED"); + return ERR_PTR(-ECONNABORTED); + +old_call: + spin_unlock(&conn->channel_lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -ECONNRESET [old]"); + return ERR_PTR(-ECONNRESET); +} + +/* + * detach a call from a socket and set up for release + */ +void rxrpc_release_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + struct rxrpc_sock *rx = call->socket; + + _enter("{%d,%d,%d,%d}", + call->debug_id, atomic_read(&call->usage), + atomic_read(&call->ackr_not_idle), + call->rx_first_oos); + + spin_lock_bh(&call->lock); + if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) + BUG(); + spin_unlock_bh(&call->lock); + + /* dissociate from the socket + * - the socket's ref on the call is passed to the death timer + */ + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + + spin_lock(&conn->params.peer->lock); + hlist_del_init(&call->error_link); + spin_unlock(&conn->params.peer->lock); + + write_lock_bh(&rx->call_lock); + if (!list_empty(&call->accept_link)) { + _debug("unlinking once-pending call %p { e=%lx f=%lx }", + call, call->events, call->flags); + ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + rb_erase(&call->sock_node, &rx->calls); + memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); + clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + } + write_unlock_bh(&rx->call_lock); + + /* free up the channel for reuse */ + write_lock_bh(&call->state_lock); + + if (call->state < RXRPC_CALL_COMPLETE && + call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { + _debug("+++ ABORTING STATE %d +++\n", call->state); + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_CALL_DEAD; + } + write_unlock_bh(&call->state_lock); + + rxrpc_disconnect_call(call); + + /* clean up the Rx queue */ + if (!skb_queue_empty(&call->rx_queue) || + !skb_queue_empty(&call->rx_oos_queue)) { + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + + _debug("purge Rx queues"); + + spin_lock_bh(&call->lock); + while ((skb = skb_dequeue(&call->rx_queue)) || + (skb = skb_dequeue(&call->rx_oos_queue))) { + spin_unlock_bh(&call->lock); + + sp = rxrpc_skb(skb); + _debug("- zap %s %%%u #%u", + rxrpc_pkts[sp->hdr.type], + sp->hdr.serial, sp->hdr.seq); + rxrpc_free_skb(skb); + spin_lock_bh(&call->lock); + } + spin_unlock_bh(&call->lock); + + ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE); + } + + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + del_timer_sync(&call->lifetimer); + call->deadspan.expires = jiffies + rxrpc_dead_call_expiry; + add_timer(&call->deadspan); + + _leave(""); +} + +/* + * handle a dead call being ready for reaping + */ +static void rxrpc_dead_call_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + write_lock_bh(&call->state_lock); + call->state = RXRPC_CALL_DEAD; + write_unlock_bh(&call->state_lock); + rxrpc_put_call(call); +} + +/* + * mark a call as to be released, aborting it if it's still in progress + * - called with softirqs disabled + */ +static void rxrpc_mark_call_released(struct rxrpc_call *call) +{ + bool sched; + + write_lock(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) { + sched = false; + if (call->state < RXRPC_CALL_COMPLETE) { + _debug("abort call %p", call); + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_CALL_DEAD; + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) + sched = true; + } + if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + sched = true; + if (sched) + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); +} + +/* + * release all the calls associated with a socket + */ +void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("%p", rx); + + read_lock_bh(&rx->call_lock); + + /* mark all the calls as no longer wanting incoming packets */ + for (p = rb_first(&rx->calls); p; p = rb_next(p)) { + call = rb_entry(p, struct rxrpc_call, sock_node); + rxrpc_mark_call_released(call); + } + + /* kill the not-yet-accepted incoming calls */ + list_for_each_entry(call, &rx->secureq, accept_link) { + rxrpc_mark_call_released(call); + } + + list_for_each_entry(call, &rx->acceptq, accept_link) { + rxrpc_mark_call_released(call); + } + + read_unlock_bh(&rx->call_lock); + _leave(""); +} + +/* + * release a call + */ +void __rxrpc_put_call(struct rxrpc_call *call) +{ + ASSERT(call != NULL); + + _enter("%p{u=%d}", call, atomic_read(&call->usage)); + + ASSERTCMP(atomic_read(&call->usage), >, 0); + + if (atomic_dec_and_test(&call->usage)) { + _debug("call %d dead", call->debug_id); + WARN_ON(atomic_read(&call->skb_count) != 0); + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + rxrpc_queue_work(&call->destroyer); + } + _leave(""); +} + +/* + * Final call destruction under RCU. + */ +static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) +{ + struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + + rxrpc_purge_queue(&call->rx_queue); + kmem_cache_free(rxrpc_call_jar, call); +} + +/* + * clean up a call + */ +static void rxrpc_cleanup_call(struct rxrpc_call *call) +{ + _net("DESTROY CALL %d", call->debug_id); + + ASSERT(call->socket); + + memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); + + del_timer_sync(&call->lifetimer); + del_timer_sync(&call->deadspan); + del_timer_sync(&call->ack_timer); + del_timer_sync(&call->resend_timer); + + ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); + ASSERTCMP(call->events, ==, 0); + if (work_pending(&call->processor)) { + _debug("defer destroy"); + rxrpc_queue_work(&call->destroyer); + return; + } + + ASSERTCMP(call->conn, ==, NULL); + + if (call->acks_window) { + _debug("kill Tx window %d", + CIRC_CNT(call->acks_head, call->acks_tail, + call->acks_winsz)); + smp_mb(); + while (CIRC_CNT(call->acks_head, call->acks_tail, + call->acks_winsz) > 0) { + struct rxrpc_skb_priv *sp; + unsigned long _skb; + + _skb = call->acks_window[call->acks_tail] & ~1; + sp = rxrpc_skb((struct sk_buff *)_skb); + _debug("+++ clear Tx %u", sp->hdr.seq); + rxrpc_free_skb((struct sk_buff *)_skb); + call->acks_tail = + (call->acks_tail + 1) & (call->acks_winsz - 1); + } + + kfree(call->acks_window); + } + + rxrpc_free_skb(call->tx_pending); + + rxrpc_purge_queue(&call->rx_queue); + ASSERT(skb_queue_empty(&call->rx_oos_queue)); + sock_put(&call->socket->sk); + call_rcu(&call->rcu, rxrpc_rcu_destroy_call); +} + +/* + * destroy a call + */ +static void rxrpc_destroy_call(struct work_struct *work) +{ + struct rxrpc_call *call = + container_of(work, struct rxrpc_call, destroyer); + + _enter("%p{%d,%d,%p}", + call, atomic_read(&call->usage), call->channel, call->conn); + + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + + write_lock_bh(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock_bh(&rxrpc_call_lock); + + rxrpc_cleanup_call(call); + _leave(""); +} + +/* + * preemptively destroy all the call records from a transport endpoint rather + * than waiting for them to time out + */ +void __exit rxrpc_destroy_all_calls(void) +{ + struct rxrpc_call *call; + + _enter(""); + write_lock_bh(&rxrpc_call_lock); + + while (!list_empty(&rxrpc_calls)) { + call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); + _debug("Zapping call %p", call); + + list_del_init(&call->link); + + switch (atomic_read(&call->usage)) { + case 0: + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + break; + case 1: + if (del_timer_sync(&call->deadspan) != 0 && + call->state != RXRPC_CALL_DEAD) + rxrpc_dead_call_expired((unsigned long) call); + if (call->state != RXRPC_CALL_DEAD) + break; + default: + pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", + call, atomic_read(&call->usage), + atomic_read(&call->ackr_not_idle), + rxrpc_call_states[call->state], + call->flags, call->events); + if (!skb_queue_empty(&call->rx_queue)) + pr_err("Rx queue occupied\n"); + if (!skb_queue_empty(&call->rx_oos_queue)) + pr_err("OOS queue occupied\n"); + break; + } + + write_unlock_bh(&rxrpc_call_lock); + cond_resched(); + write_lock_bh(&rxrpc_call_lock); + } + + write_unlock_bh(&rxrpc_call_lock); + _leave(""); +} + +/* + * handle call lifetime being exceeded + */ +static void rxrpc_call_life_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + _enter("{%d}", call->debug_id); + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); +} + +/* + * handle resend timer expiry + * - may not take call->state_lock as this can deadlock against del_timer_sync() + */ +static void rxrpc_resend_time_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) + rxrpc_queue_call(call); +} + +/* + * handle ACK timer expiry + */ +static void rxrpc_ack_time_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c new file mode 100644 index 000000000..9e91f27b0 --- /dev/null +++ b/net/rxrpc/conn_client.c @@ -0,0 +1,372 @@ +/* Client connection-specific management code. + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include "ar-internal.h" + +/* + * We use machine-unique IDs for our client connections. + */ +DEFINE_IDR(rxrpc_client_conn_ids); +static DEFINE_SPINLOCK(rxrpc_conn_id_lock); + +/* + * Get a connection ID and epoch for a client connection from the global pool. + * The connection struct pointer is then recorded in the idr radix tree. The + * epoch is changed if this wraps. + * + * TODO: The IDR tree gets very expensive on memory if the connection IDs are + * widely scattered throughout the number space, so we shall need to retire + * connections that have, say, an ID more than four times the maximum number of + * client conns away from the current allocation point to try and keep the IDs + * concentrated. We will also need to retire connections from an old epoch. + */ +static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, + gfp_t gfp) +{ + u32 epoch; + int id; + + _enter(""); + + idr_preload(gfp); + spin_lock(&rxrpc_conn_id_lock); + + epoch = rxrpc_epoch; + + /* We could use idr_alloc_cyclic() here, but we really need to know + * when the thing wraps so that we can advance the epoch. + */ + if (rxrpc_client_conn_ids.cur == 0) + rxrpc_client_conn_ids.cur = 1; + id = idr_alloc(&rxrpc_client_conn_ids, conn, + rxrpc_client_conn_ids.cur, 0x40000000, GFP_NOWAIT); + if (id < 0) { + if (id != -ENOSPC) + goto error; + id = idr_alloc(&rxrpc_client_conn_ids, conn, + 1, 0x40000000, GFP_NOWAIT); + if (id < 0) + goto error; + epoch++; + rxrpc_epoch = epoch; + } + rxrpc_client_conn_ids.cur = id + 1; + + spin_unlock(&rxrpc_conn_id_lock); + idr_preload_end(); + + conn->proto.epoch = epoch; + conn->proto.cid = id << RXRPC_CIDSHIFT; + set_bit(RXRPC_CONN_HAS_IDR, &conn->flags); + _leave(" [CID %x:%x]", epoch, conn->proto.cid); + return 0; + +error: + spin_unlock(&rxrpc_conn_id_lock); + idr_preload_end(); + _leave(" = %d", id); + return id; +} + +/* + * Release a connection ID for a client connection from the global pool. + */ +static void rxrpc_put_client_connection_id(struct rxrpc_connection *conn) +{ + if (test_bit(RXRPC_CONN_HAS_IDR, &conn->flags)) { + spin_lock(&rxrpc_conn_id_lock); + idr_remove(&rxrpc_client_conn_ids, + conn->proto.cid >> RXRPC_CIDSHIFT); + spin_unlock(&rxrpc_conn_id_lock); + } +} + +/* + * Destroy the client connection ID tree. + */ +void rxrpc_destroy_client_conn_ids(void) +{ + struct rxrpc_connection *conn; + int id; + + if (!idr_is_empty(&rxrpc_client_conn_ids)) { + idr_for_each_entry(&rxrpc_client_conn_ids, conn, id) { + pr_err("AF_RXRPC: Leaked client conn %p {%d}\n", + conn, atomic_read(&conn->usage)); + } + BUG(); + } + + idr_destroy(&rxrpc_client_conn_ids); +} + +/* + * Allocate a client connection. The caller must take care to clear any + * padding bytes in *cp. + */ +static struct rxrpc_connection * +rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) +{ + struct rxrpc_connection *conn; + int ret; + + _enter(""); + + conn = rxrpc_alloc_connection(gfp); + if (!conn) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + conn->params = *cp; + conn->out_clientflag = RXRPC_CLIENT_INITIATED; + conn->state = RXRPC_CONN_CLIENT; + + ret = rxrpc_get_client_connection_id(conn, gfp); + if (ret < 0) + goto error_0; + + ret = rxrpc_init_client_conn_security(conn); + if (ret < 0) + goto error_1; + + ret = conn->security->prime_packet_security(conn); + if (ret < 0) + goto error_2; + + write_lock(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock(&rxrpc_connection_lock); + + /* We steal the caller's peer ref. */ + cp->peer = NULL; + rxrpc_get_local(conn->params.local); + key_get(conn->params.key); + + _leave(" = %p", conn); + return conn; + +error_2: + conn->security->clear(conn); +error_1: + rxrpc_put_client_connection_id(conn); +error_0: + kfree(conn); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * find a connection for a call + * - called in process context with IRQs enabled + */ +int rxrpc_connect_call(struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + struct rxrpc_connection *conn, *candidate = NULL; + struct rxrpc_local *local = cp->local; + struct rb_node *p, **pp, *parent; + long diff; + int chan; + + DECLARE_WAITQUEUE(myself, current); + + _enter("{%d,%lx},", call->debug_id, call->user_call_ID); + + cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); + if (!cp->peer) + return -ENOMEM; + + if (!cp->exclusive) { + /* Search for a existing client connection unless this is going + * to be a connection that's used exclusively for a single call. + */ + _debug("search 1"); + spin_lock(&local->client_conns_lock); + p = local->client_conns.rb_node; + while (p) { + conn = rb_entry(p, struct rxrpc_connection, client_node); + +#define cmp(X) ((long)conn->params.X - (long)cp->X) + diff = (cmp(peer) ?: + cmp(key) ?: + cmp(security_level)); + if (diff < 0) + p = p->rb_left; + else if (diff > 0) + p = p->rb_right; + else + goto found_extant_conn; + } + spin_unlock(&local->client_conns_lock); + } + + /* We didn't find a connection or we want an exclusive one. */ + _debug("get new conn"); + candidate = rxrpc_alloc_client_connection(cp, gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + if (cp->exclusive) { + /* Assign the call on an exclusive connection to channel 0 and + * don't add the connection to the endpoint's shareable conn + * lookup tree. + */ + _debug("exclusive chan 0"); + conn = candidate; + atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); + spin_lock(&conn->channel_lock); + chan = 0; + goto found_channel; + } + + /* We need to redo the search before attempting to add a new connection + * lest we race with someone else adding a conflicting instance. + */ + _debug("search 2"); + spin_lock(&local->client_conns_lock); + + pp = &local->client_conns.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + conn = rb_entry(parent, struct rxrpc_connection, client_node); + + diff = (cmp(peer) ?: + cmp(key) ?: + cmp(security_level)); + if (diff < 0) + pp = &(*pp)->rb_left; + else if (diff > 0) + pp = &(*pp)->rb_right; + else + goto found_extant_conn; + } + + /* The second search also failed; simply add the new connection with + * the new call in channel 0. Note that we need to take the channel + * lock before dropping the client conn lock. + */ + _debug("new conn"); + set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); + rb_link_node(&candidate->client_node, parent, pp); + rb_insert_color(&candidate->client_node, &local->client_conns); +attached: + conn = candidate; + candidate = NULL; + + atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); + spin_lock(&conn->channel_lock); + spin_unlock(&local->client_conns_lock); + chan = 0; + +found_channel: + _debug("found chan"); + call->conn = conn; + call->channel = chan; + call->epoch = conn->proto.epoch; + call->cid = conn->proto.cid | chan; + call->call_id = ++conn->channels[chan].call_counter; + conn->channels[chan].call_id = call->call_id; + rcu_assign_pointer(conn->channels[chan].call, call); + + _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id); + + spin_unlock(&conn->channel_lock); + rxrpc_put_peer(cp->peer); + cp->peer = NULL; + _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); + return 0; + + /* We found a potentially suitable connection already in existence. If + * we can reuse it (ie. its usage count hasn't been reduced to 0 by the + * reaper), discard any candidate we may have allocated, and try to get + * a channel on this one, otherwise we have to replace it. + */ +found_extant_conn: + _debug("found conn"); + if (!rxrpc_get_connection_maybe(conn)) { + set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); + rb_replace_node(&conn->client_node, + &candidate->client_node, + &local->client_conns); + clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags); + goto attached; + } + + spin_unlock(&local->client_conns_lock); + + rxrpc_put_connection(candidate); + + if (!atomic_add_unless(&conn->avail_chans, -1, 0)) { + if (!gfpflags_allow_blocking(gfp)) { + rxrpc_put_connection(conn); + _leave(" = -EAGAIN"); + return -EAGAIN; + } + + add_wait_queue(&conn->channel_wq, &myself); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (atomic_add_unless(&conn->avail_chans, -1, 0)) + break; + if (signal_pending(current)) + goto interrupted; + schedule(); + } + remove_wait_queue(&conn->channel_wq, &myself); + __set_current_state(TASK_RUNNING); + } + + /* The connection allegedly now has a free channel and we can now + * attach the call to it. + */ + spin_lock(&conn->channel_lock); + + for (chan = 0; chan < RXRPC_MAXCALLS; chan++) + if (!conn->channels[chan].call) + goto found_channel; + BUG(); + +interrupted: + remove_wait_queue(&conn->channel_wq, &myself); + __set_current_state(TASK_RUNNING); + rxrpc_put_connection(conn); + rxrpc_put_peer(cp->peer); + cp->peer = NULL; + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; +} + +/* + * Remove a client connection from the local endpoint's tree, thereby removing + * it as a target for reuse for new client calls. + */ +void rxrpc_unpublish_client_conn(struct rxrpc_connection *conn) +{ + struct rxrpc_local *local = conn->params.local; + + spin_lock(&local->client_conns_lock); + if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) + rb_erase(&conn->client_node, &local->client_conns); + spin_unlock(&local->client_conns_lock); + + rxrpc_put_client_connection_id(conn); +} diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c new file mode 100644 index 000000000..cee0f35bc --- /dev/null +++ b/net/rxrpc/conn_event.c @@ -0,0 +1,394 @@ +/* connection-level event handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * pass a connection-level abort onto all calls on that connection + */ +static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, + u32 abort_code) +{ + struct rxrpc_call *call; + int i; + + _enter("{%d},%x", conn->debug_id, abort_code); + + spin_lock(&conn->channel_lock); + + for (i = 0; i < RXRPC_MAXCALLS; i++) { + call = rcu_dereference_protected( + conn->channels[i].call, + lockdep_is_held(&conn->channel_lock)); + write_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = state; + if (state == RXRPC_CALL_LOCALLY_ABORTED) { + call->local_abort = conn->local_abort; + set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); + } else { + call->remote_abort = conn->remote_abort; + set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); + } + rxrpc_queue_call(call); + } + write_unlock_bh(&call->state_lock); + } + + spin_unlock(&conn->channel_lock); + _leave(""); +} + +/* + * generate a connection-level abort + */ +static int rxrpc_abort_connection(struct rxrpc_connection *conn, + u32 error, u32 abort_code) +{ + struct rxrpc_wire_header whdr; + struct msghdr msg; + struct kvec iov[2]; + __be32 word; + size_t len; + u32 serial; + int ret; + + _enter("%d,,%u,%u", conn->debug_id, error, abort_code); + + /* generate a connection-level abort */ + spin_lock_bh(&conn->state_lock); + if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) { + conn->state = RXRPC_CONN_LOCALLY_ABORTED; + conn->error = error; + spin_unlock_bh(&conn->state_lock); + } else { + spin_unlock_bh(&conn->state_lock); + _leave(" = 0 [already dead]"); + return 0; + } + + rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code); + + msg.msg_name = &conn->params.peer->srx.transport; + msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(conn->proto.epoch); + whdr.cid = htonl(conn->proto.cid); + whdr.callNumber = 0; + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ABORT; + whdr.flags = conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(conn->params.service_id); + + word = htonl(conn->local_abort); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &word; + iov[1].iov_len = sizeof(word); + + len = iov[0].iov_len + iov[1].iov_len; + + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + _proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort); + + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * mark a call as being on a now-secured channel + * - must be called with softirqs disabled + */ +static void rxrpc_call_is_secure(struct rxrpc_call *call) +{ + _enter("%p", call); + if (call) { + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events)) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + } +} + +/* + * connection-level Rx packet processor + */ +static int rxrpc_process_event(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 wtmp; + u32 abort_code; + int loop, ret; + + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { + kleave(" = -ECONNABORTED [%u]", conn->state); + return -ECONNABORTED; + } + + _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_ABORT: + if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) + return -EPROTO; + abort_code = ntohl(wtmp); + _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); + + conn->state = RXRPC_CONN_REMOTELY_ABORTED; + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, + abort_code); + return -ECONNABORTED; + + case RXRPC_PACKET_TYPE_CHALLENGE: + return conn->security->respond_to_challenge(conn, skb, + _abort_code); + + case RXRPC_PACKET_TYPE_RESPONSE: + ret = conn->security->verify_response(conn, skb, _abort_code); + if (ret < 0) + return ret; + + ret = conn->security->init_connection_security(conn); + if (ret < 0) + return ret; + + ret = conn->security->prime_packet_security(conn); + if (ret < 0) + return ret; + + spin_lock(&conn->channel_lock); + spin_lock(&conn->state_lock); + + if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { + conn->state = RXRPC_CONN_SERVICE; + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure( + rcu_dereference_protected( + conn->channels[loop].call, + lockdep_is_held(&conn->channel_lock))); + } + + spin_unlock(&conn->state_lock); + spin_unlock(&conn->channel_lock); + return 0; + + default: + _leave(" = -EPROTO [%u]", sp->hdr.type); + return -EPROTO; + } +} + +/* + * set up security and issue a challenge + */ +static void rxrpc_secure_connection(struct rxrpc_connection *conn) +{ + u32 abort_code; + int ret; + + _enter("{%d}", conn->debug_id); + + ASSERT(conn->security_ix != 0); + + if (!conn->params.key) { + _debug("set up security"); + ret = rxrpc_init_server_conn_security(conn); + switch (ret) { + case 0: + break; + case -ENOENT: + abort_code = RX_CALL_DEAD; + goto abort; + default: + abort_code = RXKADNOAUTH; + goto abort; + } + } + + if (conn->security->issue_challenge(conn) < 0) { + abort_code = RX_CALL_DEAD; + ret = -ENOMEM; + goto abort; + } + + _leave(""); + return; + +abort: + _debug("abort %d, %d", ret, abort_code); + rxrpc_abort_connection(conn, -ret, abort_code); + _leave(" [aborted]"); +} + +/* + * connection-level event processor + */ +void rxrpc_process_connection(struct work_struct *work) +{ + struct rxrpc_connection *conn = + container_of(work, struct rxrpc_connection, processor); + struct sk_buff *skb; + u32 abort_code = RX_PROTOCOL_ERROR; + int ret; + + _enter("{%d}", conn->debug_id); + + if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) + rxrpc_secure_connection(conn); + + /* go through the conn-level event packets, releasing the ref on this + * connection that each one has when we've finished with it */ + while ((skb = skb_dequeue(&conn->rx_queue))) { + ret = rxrpc_process_event(conn, skb, &abort_code); + switch (ret) { + case -EPROTO: + case -EKEYEXPIRED: + case -EKEYREJECTED: + goto protocol_error; + case -EAGAIN: + goto requeue_and_leave; + case -ECONNABORTED: + default: + rxrpc_free_skb(skb); + break; + } + } + +out: + rxrpc_put_connection(conn); + _leave(""); + return; + +requeue_and_leave: + skb_queue_head(&conn->rx_queue, skb); + goto out; + +protocol_error: + if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) + goto requeue_and_leave; + rxrpc_free_skb(skb); + _leave(" [EPROTO]"); + goto out; +} + +/* + * put a packet up for transport-level abort + */ +void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) +{ + CHECK_SLAB_OKAY(&local->usage); + + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_local(local); +} + +/* + * reject packets through the local endpoint + */ +void rxrpc_reject_packets(struct rxrpc_local *local) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + } sa; + struct rxrpc_skb_priv *sp; + struct rxrpc_wire_header whdr; + struct sk_buff *skb; + struct msghdr msg; + struct kvec iov[2]; + size_t size; + __be32 code; + + _enter("%d", local->debug_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &code; + iov[1].iov_len = sizeof(code); + size = sizeof(whdr) + sizeof(code); + + msg.msg_name = &sa; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + memset(&sa, 0, sizeof(sa)); + sa.sa.sa_family = local->srx.transport.family; + switch (sa.sa.sa_family) { + case AF_INET: + msg.msg_namelen = sizeof(sa.sin); + break; + default: + msg.msg_namelen = 0; + break; + } + + memset(&whdr, 0, sizeof(whdr)); + whdr.type = RXRPC_PACKET_TYPE_ABORT; + + while ((skb = skb_dequeue(&local->reject_queue))) { + sp = rxrpc_skb(skb); + switch (sa.sa.sa_family) { + case AF_INET: + sa.sin.sin_port = udp_hdr(skb)->source; + sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + code = htonl(skb->priority); + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.serviceId = htons(sp->hdr.serviceId); + whdr.flags = sp->hdr.flags; + whdr.flags ^= RXRPC_CLIENT_INITIATED; + whdr.flags &= RXRPC_CLIENT_INITIATED; + + kernel_sendmsg(local->socket, &msg, iov, 2, size); + break; + + default: + break; + } + + rxrpc_free_skb(skb); + } + + _leave(""); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c new file mode 100644 index 000000000..896d84493 --- /dev/null +++ b/net/rxrpc/conn_object.c @@ -0,0 +1,340 @@ +/* RxRPC virtual connection handler + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * Time till a connection expires after last use (in seconds). + */ +unsigned int rxrpc_connection_expiry = 10 * 60; + +static void rxrpc_connection_reaper(struct work_struct *work); + +LIST_HEAD(rxrpc_connections); +DEFINE_RWLOCK(rxrpc_connection_lock); +static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); + +/* + * allocate a new connection + */ +struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) +{ + struct rxrpc_connection *conn; + + _enter(""); + + conn = kzalloc(sizeof(struct rxrpc_connection), gfp); + if (conn) { + spin_lock_init(&conn->channel_lock); + init_waitqueue_head(&conn->channel_wq); + INIT_WORK(&conn->processor, &rxrpc_process_connection); + INIT_LIST_HEAD(&conn->link); + skb_queue_head_init(&conn->rx_queue); + conn->security = &rxrpc_no_security; + spin_lock_init(&conn->state_lock); + /* We maintain an extra ref on the connection whilst it is + * on the rxrpc_connections list. + */ + atomic_set(&conn->usage, 2); + conn->debug_id = atomic_inc_return(&rxrpc_debug_id); + atomic_set(&conn->avail_chans, RXRPC_MAXCALLS); + conn->size_align = 4; + conn->header_size = sizeof(struct rxrpc_wire_header); + } + + _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0); + return conn; +} + +/* + * Look up a connection in the cache by protocol parameters. + * + * If successful, a pointer to the connection is returned, but no ref is taken. + * NULL is returned if there is no match. + * + * The caller must be holding the RCU read lock. + */ +struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, + struct sk_buff *skb) +{ + struct rxrpc_connection *conn; + struct rxrpc_conn_proto k; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; + + _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); + + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) + goto not_found; + + k.epoch = sp->hdr.epoch; + k.cid = sp->hdr.cid & RXRPC_CIDMASK; + + /* We may have to handle mixing IPv4 and IPv6 */ + if (srx.transport.family != local->srx.transport.family) { + pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", + srx.transport.family, + local->srx.transport.family); + goto not_found; + } + + k.epoch = sp->hdr.epoch; + k.cid = sp->hdr.cid & RXRPC_CIDMASK; + + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) { + /* We need to look up service connections by the full protocol + * parameter set. We look up the peer first as an intermediate + * step and then the connection from the peer's tree. + */ + peer = rxrpc_lookup_peer_rcu(local, &srx); + if (!peer) + goto not_found; + conn = rxrpc_find_service_conn_rcu(peer, skb); + if (!conn || atomic_read(&conn->usage) == 0) + goto not_found; + _leave(" = %p", conn); + return conn; + } else { + /* Look up client connections by connection ID alone as their + * IDs are unique for this machine. + */ + conn = idr_find(&rxrpc_client_conn_ids, + sp->hdr.cid >> RXRPC_CIDSHIFT); + if (!conn || atomic_read(&conn->usage) == 0) { + _debug("no conn"); + goto not_found; + } + + if (conn->proto.epoch != k.epoch || + conn->params.local != local) + goto not_found; + + peer = conn->params.peer; + switch (srx.transport.family) { + case AF_INET: + if (peer->srx.transport.sin.sin_port != + srx.transport.sin.sin_port || + peer->srx.transport.sin.sin_addr.s_addr != + srx.transport.sin.sin_addr.s_addr) + goto not_found; + break; + default: + BUG(); + } + + _leave(" = %p", conn); + return conn; + } + +not_found: + _leave(" = NULL"); + return NULL; +} + +/* + * Disconnect a call and clear any channel it occupies when that call + * terminates. The caller must hold the channel_lock and must release the + * call's ref on the connection. + */ +void __rxrpc_disconnect_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + struct rxrpc_channel *chan = &conn->channels[call->channel]; + + _enter("%d,%d", conn->debug_id, call->channel); + + if (rcu_access_pointer(chan->call) == call) { + /* Save the result of the call so that we can repeat it if necessary + * through the channel, whilst disposing of the actual call record. + */ + chan->last_result = call->local_abort; + smp_wmb(); + chan->last_call = chan->call_id; + chan->call_id = chan->call_counter; + + rcu_assign_pointer(chan->call, NULL); + atomic_inc(&conn->avail_chans); + wake_up(&conn->channel_wq); + } + + _leave(""); +} + +/* + * Disconnect a call and clear any channel it occupies when that call + * terminates. + */ +void rxrpc_disconnect_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + + spin_lock(&conn->channel_lock); + __rxrpc_disconnect_call(call); + spin_unlock(&conn->channel_lock); + + call->conn = NULL; + rxrpc_put_connection(conn); +} + +/* + * release a virtual connection + */ +void rxrpc_put_connection(struct rxrpc_connection *conn) +{ + if (!conn) + return; + + _enter("%p{u=%d,d=%d}", + conn, atomic_read(&conn->usage), conn->debug_id); + + ASSERTCMP(atomic_read(&conn->usage), >, 1); + + conn->put_time = ktime_get_seconds(); + if (atomic_dec_return(&conn->usage) == 1) { + _debug("zombie"); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + } + + _leave(""); +} + +/* + * destroy a virtual connection + */ +static void rxrpc_destroy_connection(struct rcu_head *rcu) +{ + struct rxrpc_connection *conn = + container_of(rcu, struct rxrpc_connection, rcu); + + _enter("{%d,u=%d}", conn->debug_id, atomic_read(&conn->usage)); + + ASSERTCMP(atomic_read(&conn->usage), ==, 0); + + _net("DESTROY CONN %d", conn->debug_id); + + rxrpc_purge_queue(&conn->rx_queue); + + conn->security->clear(conn); + key_put(conn->params.key); + key_put(conn->server_key); + rxrpc_put_peer(conn->params.peer); + rxrpc_put_local(conn->params.local); + + kfree(conn); + _leave(""); +} + +/* + * reap dead connections + */ +static void rxrpc_connection_reaper(struct work_struct *work) +{ + struct rxrpc_connection *conn, *_p; + unsigned long reap_older_than, earliest, put_time, now; + + LIST_HEAD(graveyard); + + _enter(""); + + now = ktime_get_seconds(); + reap_older_than = now - rxrpc_connection_expiry; + earliest = ULONG_MAX; + + write_lock(&rxrpc_connection_lock); + list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { + ASSERTCMP(atomic_read(&conn->usage), >, 0); + if (likely(atomic_read(&conn->usage) > 1)) + continue; + + put_time = READ_ONCE(conn->put_time); + if (time_after(put_time, reap_older_than)) { + if (time_before(put_time, earliest)) + earliest = put_time; + continue; + } + + /* The usage count sits at 1 whilst the object is unused on the + * list; we reduce that to 0 to make the object unavailable. + */ + if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) + continue; + + if (rxrpc_conn_is_client(conn)) + rxrpc_unpublish_client_conn(conn); + else + rxrpc_unpublish_service_conn(conn); + + list_move_tail(&conn->link, &graveyard); + } + write_unlock(&rxrpc_connection_lock); + + if (earliest != ULONG_MAX) { + _debug("reschedule reaper %ld", (long) earliest - now); + ASSERTCMP(earliest, >, now); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, + (earliest - now) * HZ); + } + + while (!list_empty(&graveyard)) { + conn = list_entry(graveyard.next, struct rxrpc_connection, + link); + list_del_init(&conn->link); + + ASSERTCMP(atomic_read(&conn->usage), ==, 0); + skb_queue_purge(&conn->rx_queue); + call_rcu(&conn->rcu, rxrpc_destroy_connection); + } + + _leave(""); +} + +/* + * preemptively destroy all the connection records rather than waiting for them + * to time out + */ +void __exit rxrpc_destroy_all_connections(void) +{ + struct rxrpc_connection *conn, *_p; + bool leak = false; + + _enter(""); + + rxrpc_connection_expiry = 0; + cancel_delayed_work(&rxrpc_connection_reap); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + flush_workqueue(rxrpc_workqueue); + + write_lock(&rxrpc_connection_lock); + list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { + pr_err("AF_RXRPC: Leaked conn %p {%d}\n", + conn, atomic_read(&conn->usage)); + leak = true; + } + write_unlock(&rxrpc_connection_lock); + BUG_ON(leak); + + /* Make sure the local and peer records pinned by any dying connections + * are released. + */ + rcu_barrier(); + rxrpc_destroy_client_conn_ids(); + + _leave(""); +} diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c new file mode 100644 index 000000000..fd9027ccb --- /dev/null +++ b/net/rxrpc/conn_service.c @@ -0,0 +1,230 @@ +/* Service connection management + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include "ar-internal.h" + +/* + * Find a service connection under RCU conditions. + * + * We could use a hash table, but that is subject to bucket stuffing by an + * attacker as the client gets to pick the epoch and cid values and would know + * the hash function. So, instead, we use a hash table for the peer and from + * that an rbtree to find the service connection. Under ordinary circumstances + * it might be slower than a large hash table, but it is at least limited in + * depth. + */ +struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *peer, + struct sk_buff *skb) +{ + struct rxrpc_connection *conn = NULL; + struct rxrpc_conn_proto k; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rb_node *p; + unsigned int seq = 0; + + k.epoch = sp->hdr.epoch; + k.cid = sp->hdr.cid & RXRPC_CIDMASK; + + do { + /* Unfortunately, rbtree walking doesn't give reliable results + * under just the RCU read lock, so we have to check for + * changes. + */ + read_seqbegin_or_lock(&peer->service_conn_lock, &seq); + + p = rcu_dereference_raw(peer->service_conns.rb_node); + while (p) { + conn = rb_entry(p, struct rxrpc_connection, service_node); + + if (conn->proto.index_key < k.index_key) + p = rcu_dereference_raw(p->rb_left); + else if (conn->proto.index_key > k.index_key) + p = rcu_dereference_raw(p->rb_right); + else + goto done; + conn = NULL; + } + } while (need_seqretry(&peer->service_conn_lock, seq)); + +done: + done_seqretry(&peer->service_conn_lock, seq); + _leave(" = %d", conn ? conn->debug_id : -1); + return conn; +} + +/* + * Insert a service connection into a peer's tree, thereby making it a target + * for incoming packets. + */ +static struct rxrpc_connection * +rxrpc_publish_service_conn(struct rxrpc_peer *peer, + struct rxrpc_connection *conn) +{ + struct rxrpc_connection *cursor = NULL; + struct rxrpc_conn_proto k = conn->proto; + struct rb_node **pp, *parent; + + write_seqlock_bh(&peer->service_conn_lock); + + pp = &peer->service_conns.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + cursor = rb_entry(parent, + struct rxrpc_connection, service_node); + + if (cursor->proto.index_key < k.index_key) + pp = &(*pp)->rb_left; + else if (cursor->proto.index_key > k.index_key) + pp = &(*pp)->rb_right; + else + goto found_extant_conn; + } + + rb_link_node_rcu(&conn->service_node, parent, pp); + rb_insert_color(&conn->service_node, &peer->service_conns); +conn_published: + set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); + write_sequnlock_bh(&peer->service_conn_lock); + _leave(" = %d [new]", conn->debug_id); + return conn; + +found_extant_conn: + if (atomic_read(&cursor->usage) == 0) + goto replace_old_connection; + write_sequnlock_bh(&peer->service_conn_lock); + /* We should not be able to get here. rxrpc_incoming_connection() is + * called in a non-reentrant context, so there can't be a race to + * insert a new connection. + */ + BUG(); + +replace_old_connection: + /* The old connection is from an outdated epoch. */ + _debug("replace conn"); + rb_replace_node_rcu(&cursor->service_node, + &conn->service_node, + &peer->service_conns); + clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &cursor->flags); + goto conn_published; +} + +/* + * get a record of an incoming connection + */ +struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, + struct sockaddr_rxrpc *srx, + struct sk_buff *skb) +{ + struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_peer *peer; + const char *new = "old"; + + _enter(""); + + peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); + if (!peer) { + _debug("no peer"); + return ERR_PTR(-EBUSY); + } + + ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); + + rcu_read_lock(); + peer = rxrpc_lookup_peer_rcu(local, srx); + if (peer) { + conn = rxrpc_find_service_conn_rcu(peer, skb); + if (conn) { + if (sp->hdr.securityIndex != conn->security_ix) + goto security_mismatch_rcu; + if (rxrpc_get_connection_maybe(conn)) + goto found_extant_connection_rcu; + + /* The conn has expired but we can't remove it without + * the appropriate lock, so we attempt to replace it + * when we have a new candidate. + */ + } + + if (!rxrpc_get_peer_maybe(peer)) + peer = NULL; + } + rcu_read_unlock(); + + if (!peer) { + peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); + if (!peer) + goto enomem; + } + + /* We don't have a matching record yet. */ + conn = rxrpc_alloc_connection(GFP_NOIO); + if (!conn) + goto enomem_peer; + + conn->proto.epoch = sp->hdr.epoch; + conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; + conn->params.local = local; + conn->params.peer = peer; + conn->params.service_id = sp->hdr.serviceId; + conn->security_ix = sp->hdr.securityIndex; + conn->out_clientflag = 0; + conn->state = RXRPC_CONN_SERVICE; + if (conn->params.service_id) + conn->state = RXRPC_CONN_SERVICE_UNSECURED; + + rxrpc_get_local(local); + + write_lock(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock(&rxrpc_connection_lock); + + /* Make the connection a target for incoming packets. */ + rxrpc_publish_service_conn(peer, conn); + + new = "new"; + +success: + _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); + _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); + return conn; + +found_extant_connection_rcu: + rcu_read_unlock(); + goto success; + +security_mismatch_rcu: + rcu_read_unlock(); + _leave(" = -EKEYREJECTED"); + return ERR_PTR(-EKEYREJECTED); + +enomem_peer: + rxrpc_put_peer(peer); +enomem: + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * Remove the service connection from the peer's tree, thereby removing it as a + * target for incoming packets. + */ +void rxrpc_unpublish_service_conn(struct rxrpc_connection *conn) +{ + struct rxrpc_peer *peer = conn->params.peer; + + write_seqlock_bh(&peer->service_conn_lock); + if (test_and_clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags)) + rb_erase(&conn->service_node, &peer->service_conns); + write_sequnlock_bh(&peer->service_conn_lock); +} diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c new file mode 100644 index 000000000..70bb77818 --- /dev/null +++ b/net/rxrpc/input.c @@ -0,0 +1,757 @@ +/* RxRPC packet reception + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * queue a packet for recvmsg to pass to userspace + * - the caller must hold a lock on call->lock + * - must not be called with interrupts disabled (sk_filter() disables BH's) + * - eats the packet whether successful or not + * - there must be just one reference to the packet, which the caller passes to + * this function + */ +int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, + bool force, bool terminal) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = call->socket; + struct sock *sk; + int ret; + + _enter(",,%d,%d", force, terminal); + + ASSERT(!irqs_disabled()); + + sp = rxrpc_skb(skb); + ASSERTCMP(sp->call, ==, call); + + /* if we've already posted the terminal message for a call, then we + * don't post any more */ + if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { + _debug("already terminated"); + ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); + rxrpc_free_skb(skb); + return 0; + } + + sk = &rx->sk; + + if (!force) { + /* cast skb->rcvbuf to unsigned... It's pointless, but + * reduces number of warnings when compiling with -W + * --ANK */ +// ret = -ENOBUFS; +// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= +// (unsigned int) sk->sk_rcvbuf) +// goto out; + + ret = sk_filter(sk, skb); + if (ret < 0) + goto out; + } + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && + !test_bit(RXRPC_CALL_RELEASED, &call->flags) && + call->socket->sk.sk_state != RXRPC_CLOSE) { + skb->destructor = rxrpc_packet_destructor; + skb->dev = NULL; + skb->sk = sk; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + + if (terminal) { + _debug("<<<< TERMINAL MESSAGE >>>>"); + set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); + } + + /* allow interception by a kernel service */ + if (rx->interceptor) { + rx->interceptor(sk, call->user_call_ID, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + } else { + _net("post skb %p", skb); + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + } + skb = NULL; + } else { + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + ret = 0; + +out: + rxrpc_free_skb(skb); + + _leave(" = %d", ret); + return ret; +} + +/* + * process a DATA packet, posting the packet to the appropriate queue + * - eats the packet if successful + */ +static int rxrpc_fast_process_data(struct rxrpc_call *call, + struct sk_buff *skb, u32 seq) +{ + struct rxrpc_skb_priv *sp; + bool terminal; + int ret, ackbit, ack; + u32 serial; + u8 flags; + + _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); + + sp = rxrpc_skb(skb); + ASSERTCMP(sp->call, ==, NULL); + flags = sp->hdr.flags; + serial = sp->hdr.serial; + + spin_lock(&call->lock); + + if (call->state > RXRPC_CALL_COMPLETE) + goto discard; + + ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); + ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv); + ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten); + + if (seq < call->rx_data_post) { + _debug("dup #%u [-%u]", seq, call->rx_data_post); + ack = RXRPC_ACK_DUPLICATE; + ret = -ENOBUFS; + goto discard_and_ack; + } + + /* we may already have the packet in the out of sequence queue */ + ackbit = seq - (call->rx_data_eaten + 1); + ASSERTCMP(ackbit, >=, 0); + if (__test_and_set_bit(ackbit, call->ackr_window)) { + _debug("dup oos #%u [%u,%u]", + seq, call->rx_data_eaten, call->rx_data_post); + ack = RXRPC_ACK_DUPLICATE; + goto discard_and_ack; + } + + if (seq >= call->ackr_win_top) { + _debug("exceed #%u [%u]", seq, call->ackr_win_top); + __clear_bit(ackbit, call->ackr_window); + ack = RXRPC_ACK_EXCEEDS_WINDOW; + goto discard_and_ack; + } + + if (seq == call->rx_data_expect) { + clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags); + call->rx_data_expect++; + } else if (seq > call->rx_data_expect) { + _debug("oos #%u [%u]", seq, call->rx_data_expect); + call->rx_data_expect = seq + 1; + if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) { + ack = RXRPC_ACK_OUT_OF_SEQUENCE; + goto enqueue_and_ack; + } + goto enqueue_packet; + } + + if (seq != call->rx_data_post) { + _debug("ahead #%u [%u]", seq, call->rx_data_post); + goto enqueue_packet; + } + + if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) + goto protocol_error; + + /* if the packet need security things doing to it, then it goes down + * the slow path */ + if (call->conn->security_ix) + goto enqueue_packet; + + sp->call = call; + rxrpc_get_call(call); + atomic_inc(&call->skb_count); + terminal = ((flags & RXRPC_LAST_PACKET) && + !(flags & RXRPC_CLIENT_INITIATED)); + ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOBUFS) { + __clear_bit(ackbit, call->ackr_window); + ack = RXRPC_ACK_NOSPACE; + goto discard_and_ack; + } + goto out; + } + + skb = NULL; + sp = NULL; + + _debug("post #%u", seq); + ASSERTCMP(call->rx_data_post, ==, seq); + call->rx_data_post++; + + if (flags & RXRPC_LAST_PACKET) + set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); + + /* if we've reached an out of sequence packet then we need to drain + * that queue into the socket Rx queue now */ + if (call->rx_data_post == call->rx_first_oos) { + _debug("drain rx oos now"); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + } + + spin_unlock(&call->lock); + atomic_inc(&call->ackr_not_idle); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false); + _leave(" = 0 [posted]"); + return 0; + +protocol_error: + ret = -EBADMSG; +out: + spin_unlock(&call->lock); + _leave(" = %d", ret); + return ret; + +discard_and_ack: + _debug("discard and ACK packet %p", skb); + __rxrpc_propose_ACK(call, ack, serial, true); +discard: + spin_unlock(&call->lock); + rxrpc_free_skb(skb); + _leave(" = 0 [discarded]"); + return 0; + +enqueue_and_ack: + __rxrpc_propose_ACK(call, ack, serial, true); +enqueue_packet: + _net("defer skb %p", skb); + spin_unlock(&call->lock); + skb_queue_tail(&call->rx_queue, skb); + atomic_inc(&call->ackr_not_idle); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + _leave(" = 0 [queued]"); + return 0; +} + +/* + * assume an implicit ACKALL of the transmission phase of a client socket upon + * reception of the first reply packet + */ +static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) +{ + write_lock_bh(&call->state_lock); + + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + call->acks_latest = serial; + + _debug("implicit ACKALL %%%u", call->acks_latest); + set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events); + write_unlock_bh(&call->state_lock); + + if (try_to_del_timer_sync(&call->resend_timer) >= 0) { + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + break; + + default: + write_unlock_bh(&call->state_lock); + break; + } +} + +/* + * post an incoming packet to the nominated call to deal with + * - must get rid of the sk_buff, either by freeing it or by queuing it + */ +void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 wtmp; + u32 hi_serial, abort_code; + + _enter("%p,%p", call, skb); + + ASSERT(!irqs_disabled()); + +#if 0 // INJECT RX ERROR + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { + static int skip = 0; + if (++skip == 3) { + printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); + skip = 0; + goto free_packet; + } + } +#endif + + /* track the latest serial number on this connection for ACK packet + * information */ + hi_serial = atomic_read(&call->conn->hi_serial); + while (sp->hdr.serial > hi_serial) + hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, + sp->hdr.serial); + + /* request ACK generation for any ACK or DATA packet that requests + * it */ + if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + _proto("ACK Requested on %%%u", sp->hdr.serial); + rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false); + } + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_ABORT: + _debug("abort"); + + if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) + goto protocol_error; + + abort_code = ntohl(wtmp); + _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); + + write_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_REMOTELY_ABORTED; + call->remote_abort = abort_code; + set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); + rxrpc_queue_call(call); + } + goto free_packet_unlock; + + case RXRPC_PACKET_TYPE_BUSY: + _proto("Rx BUSY %%%u", sp->hdr.serial); + + if (rxrpc_conn_is_service(call->conn)) + goto protocol_error; + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_SERVER_BUSY; + set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); + rxrpc_queue_call(call); + case RXRPC_CALL_SERVER_BUSY: + goto free_packet_unlock; + default: + goto protocol_error_locked; + } + + default: + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); + goto protocol_error; + + case RXRPC_PACKET_TYPE_DATA: + _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); + + if (sp->hdr.seq == 0) + goto protocol_error; + + call->ackr_prev_seq = sp->hdr.seq; + + /* received data implicitly ACKs all of the request packets we + * sent when we're acting as a client */ + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) + rxrpc_assume_implicit_ackall(call, sp->hdr.serial); + + switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { + case 0: + skb = NULL; + goto done; + + default: + BUG(); + + /* data packet received beyond the last packet */ + case -EBADMSG: + goto protocol_error; + } + + case RXRPC_PACKET_TYPE_ACKALL: + case RXRPC_PACKET_TYPE_ACK: + /* ACK processing is done in process context */ + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) { + skb_queue_tail(&call->rx_queue, skb); + rxrpc_queue_call(call); + skb = NULL; + } + read_unlock_bh(&call->state_lock); + goto free_packet; + } + +protocol_error: + _debug("protocol error"); + write_lock_bh(&call->state_lock); +protocol_error_locked: + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_PROTOCOL_ERROR; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } +free_packet_unlock: + write_unlock_bh(&call->state_lock); +free_packet: + rxrpc_free_skb(skb); +done: + _leave(""); +} + +/* + * split up a jumbo data packet + */ +static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, + struct sk_buff *jumbo) +{ + struct rxrpc_jumbo_header jhdr; + struct rxrpc_skb_priv *sp; + struct sk_buff *part; + + _enter(",{%u,%u}", jumbo->data_len, jumbo->len); + + sp = rxrpc_skb(jumbo); + + do { + sp->hdr.flags &= ~RXRPC_JUMBO_PACKET; + + /* make a clone to represent the first subpacket in what's left + * of the jumbo packet */ + part = skb_clone(jumbo, GFP_ATOMIC); + if (!part) { + /* simply ditch the tail in the event of ENOMEM */ + pskb_trim(jumbo, RXRPC_JUMBO_DATALEN); + break; + } + rxrpc_new_skb(part); + + pskb_trim(part, RXRPC_JUMBO_DATALEN); + + if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN)) + goto protocol_error; + + if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0) + goto protocol_error; + if (!pskb_pull(jumbo, sizeof(jhdr))) + BUG(); + + sp->hdr.seq += 1; + sp->hdr.serial += 1; + sp->hdr.flags = jhdr.flags; + sp->hdr._rsvd = ntohs(jhdr._rsvd); + + _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); + + rxrpc_fast_process_packet(call, part); + part = NULL; + + } while (sp->hdr.flags & RXRPC_JUMBO_PACKET); + + rxrpc_fast_process_packet(call, jumbo); + _leave(""); + return; + +protocol_error: + _debug("protocol error"); + rxrpc_free_skb(part); + rxrpc_free_skb(jumbo); + write_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_PROTOCOL_ERROR; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } + write_unlock_bh(&call->state_lock); + _leave(""); +} + +/* + * post an incoming packet to the appropriate call/socket to deal with + * - must get rid of the sk_buff, either by freeing it or by queuing it + */ +static void rxrpc_post_packet_to_call(struct rxrpc_call *call, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp; + + _enter("%p,%p", call, skb); + + sp = rxrpc_skb(skb); + + _debug("extant call [%d]", call->state); + + read_lock(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_LOCALLY_ABORTED: + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) { + rxrpc_queue_call(call); + goto free_unlock; + } + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_NETWORK_ERROR: + case RXRPC_CALL_DEAD: + goto dead_call; + case RXRPC_CALL_COMPLETE: + case RXRPC_CALL_CLIENT_FINAL_ACK: + /* complete server call */ + if (rxrpc_conn_is_service(call->conn)) + goto dead_call; + /* resend last packet of a completed call */ + _debug("final ack again"); + rxrpc_get_call(call); + set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); + rxrpc_queue_call(call); + goto free_unlock; + default: + break; + } + + read_unlock(&call->state_lock); + rxrpc_get_call(call); + + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.flags & RXRPC_JUMBO_PACKET) + rxrpc_process_jumbo_packet(call, skb); + else + rxrpc_fast_process_packet(call, skb); + + rxrpc_put_call(call); + goto done; + +dead_call: + if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { + skb->priority = RX_CALL_DEAD; + rxrpc_reject_packet(call->conn->params.local, skb); + goto unlock; + } +free_unlock: + rxrpc_free_skb(skb); +unlock: + read_unlock(&call->state_lock); +done: + _leave(""); +} + +/* + * post connection-level events to the connection + * - this includes challenges, responses and some aborts + */ +static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + _enter("%p,%p", conn, skb); + + skb_queue_tail(&conn->rx_queue, skb); + rxrpc_queue_conn(conn); +} + +/* + * post endpoint-level events to the local endpoint + * - this includes debug and version messages + */ +static void rxrpc_post_packet_to_local(struct rxrpc_local *local, + struct sk_buff *skb) +{ + _enter("%p,%p", local, skb); + + skb_queue_tail(&local->event_queue, skb); + rxrpc_queue_local(local); +} + +/* + * Extract the wire header from a packet and translate the byte order. + */ +static noinline +int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + + /* dig out the RxRPC connection details */ + if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) + return -EBADMSG; + if (!pskb_pull(skb, sizeof(whdr))) + BUG(); + + memset(sp, 0, sizeof(*sp)); + sp->hdr.epoch = ntohl(whdr.epoch); + sp->hdr.cid = ntohl(whdr.cid); + sp->hdr.callNumber = ntohl(whdr.callNumber); + sp->hdr.seq = ntohl(whdr.seq); + sp->hdr.serial = ntohl(whdr.serial); + sp->hdr.flags = whdr.flags; + sp->hdr.type = whdr.type; + sp->hdr.userStatus = whdr.userStatus; + sp->hdr.securityIndex = whdr.securityIndex; + sp->hdr._rsvd = ntohs(whdr._rsvd); + sp->hdr.serviceId = ntohs(whdr.serviceId); + return 0; +} + +/* + * handle data received on the local endpoint + * - may be called in interrupt context + * + * The socket is locked by the caller and this prevents the socket from being + * shut down and the local endpoint from going away, thus sk_user_data will not + * be cleared until this function returns. + */ +void rxrpc_data_ready(struct sock *sk) +{ + struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp; + struct rxrpc_local *local = sk->sk_user_data; + struct sk_buff *skb; + int ret; + + _enter("%p", sk); + + ASSERT(!irqs_disabled()); + + skb = skb_recv_datagram(sk, 0, 1, &ret); + if (!skb) { + if (ret == -EAGAIN) + return; + _debug("UDP socket error %d", ret); + return; + } + + rxrpc_new_skb(skb); + + _net("recv skb %p", skb); + + /* we'll probably need to checksum it (didn't call sock_recvmsg) */ + if (skb_checksum_complete(skb)) { + rxrpc_free_skb(skb); + __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); + _leave(" [CSUM failed]"); + return; + } + + __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0); + + /* The socket buffer we have is owned by UDP, with UDP's data all over + * it, but we really want our own data there. + */ + skb_orphan(skb); + sp = rxrpc_skb(skb); + + _net("Rx UDP packet from %08x:%04hu", + ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); + + /* dig out the RxRPC connection details */ + if (rxrpc_extract_header(sp, skb) < 0) + goto bad_message; + + _net("Rx RxRPC %s ep=%x call=%x:%x", + sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", + sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); + + if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || + !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { + _proto("Rx Bad Packet Type %u", sp->hdr.type); + goto bad_message; + } + + if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { + rxrpc_post_packet_to_local(local, skb); + goto out; + } + + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) + goto bad_message; + + rcu_read_lock(); + + conn = rxrpc_find_connection_rcu(local, skb); + if (!conn) + goto cant_route_call; + + if (sp->hdr.callNumber == 0) { + /* Connection-level packet */ + _debug("CONN %p {%d}", conn, conn->debug_id); + rxrpc_post_packet_to_conn(conn, skb); + } else { + /* Call-bound packets are routed by connection channel. */ + unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK; + struct rxrpc_channel *chan = &conn->channels[channel]; + struct rxrpc_call *call = rcu_dereference(chan->call); + + if (!call || atomic_read(&call->usage) == 0) + goto cant_route_call; + + rxrpc_post_packet_to_call(call, skb); + } + + rcu_read_unlock(); +out: + return; + +cant_route_call: + rcu_read_unlock(); + + _debug("can't route call"); + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && + sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { + if (sp->hdr.seq == 1) { + _debug("first packet"); + skb_queue_tail(&local->accept_queue, skb); + rxrpc_queue_work(&local->processor); + _leave(" [incoming]"); + return; + } + skb->priority = RX_INVALID_OPERATION; + } else { + skb->priority = RX_CALL_DEAD; + } + + if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { + _debug("reject type %d",sp->hdr.type); + rxrpc_reject_packet(local, skb); + } else { + rxrpc_free_skb(skb); + } + _leave(" [no call]"); + return; + +bad_message: + skb->priority = RX_PROTOCOL_ERROR; + rxrpc_reject_packet(local, skb); + _leave(" [badmsg]"); +} diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index e57140361..c21ad213b 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -17,11 +17,12 @@ static int none_init_connection_security(struct rxrpc_connection *conn) return 0; } -static void none_prime_packet_security(struct rxrpc_connection *conn) +static int none_prime_packet_security(struct rxrpc_connection *conn) { + return 0; } -static int none_secure_packet(const struct rxrpc_call *call, +static int none_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, size_t data_size, void *sechdr) @@ -29,7 +30,7 @@ static int none_secure_packet(const struct rxrpc_call *call, return 0; } -static int none_verify_packet(const struct rxrpc_call *call, +static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, u32 *_abort_code) { diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c new file mode 100644 index 000000000..18c737a61 --- /dev/null +++ b/net/rxrpc/key.c @@ -0,0 +1,1237 @@ +/* RxRPC key management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * RxRPC keys should have a description of describing their purpose: + * "afs@CAMBRIDGE.REDHAT.COM> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static int rxrpc_vet_description_s(const char *); +static int rxrpc_preparse(struct key_preparsed_payload *); +static int rxrpc_preparse_s(struct key_preparsed_payload *); +static void rxrpc_free_preparse(struct key_preparsed_payload *); +static void rxrpc_free_preparse_s(struct key_preparsed_payload *); +static void rxrpc_destroy(struct key *); +static void rxrpc_destroy_s(struct key *); +static void rxrpc_describe(const struct key *, struct seq_file *); +static long rxrpc_read(const struct key *, char __user *, size_t); + +/* + * rxrpc defined keys take an arbitrary string as the description and an + * arbitrary blob of data as the payload + */ +struct key_type key_type_rxrpc = { + .name = "rxrpc", + .preparse = rxrpc_preparse, + .free_preparse = rxrpc_free_preparse, + .instantiate = generic_key_instantiate, + .destroy = rxrpc_destroy, + .describe = rxrpc_describe, + .read = rxrpc_read, +}; +EXPORT_SYMBOL(key_type_rxrpc); + +/* + * rxrpc server defined keys take ":" as the + * description and an 8-byte decryption key as the payload + */ +struct key_type key_type_rxrpc_s = { + .name = "rxrpc_s", + .vet_description = rxrpc_vet_description_s, + .preparse = rxrpc_preparse_s, + .free_preparse = rxrpc_free_preparse_s, + .instantiate = generic_key_instantiate, + .destroy = rxrpc_destroy_s, + .describe = rxrpc_describe, +}; + +/* + * Vet the description for an RxRPC server key + */ +static int rxrpc_vet_description_s(const char *desc) +{ + unsigned long num; + char *p; + + num = simple_strtoul(desc, &p, 10); + if (*p != ':' || num > 65535) + return -EINVAL; + num = simple_strtoul(p + 1, &p, 10); + if (*p || num < 1 || num > 255) + return -EINVAL; + return 0; +} + +/* + * parse an RxKAD type XDR format token + * - the caller guarantees we have at least 4 words + */ +static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + size_t plen; + u32 tktlen; + + _enter(",{%x,%x,%x,%x},%u", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + if (toklen <= 8 * 4) + return -EKEYREJECTED; + tktlen = ntohl(xdr[7]); + _debug("tktlen: %x", tktlen); + if (tktlen > AFSTOKEN_RK_TIX_MAX) + return -EKEYREJECTED; + if (toklen < 8 * 4 + tktlen) + return -EKEYREJECTED; + + plen = sizeof(*token) + sizeof(*token->kad) + tktlen; + prep->quotalen = datalen + plen; + + plen -= sizeof(*token); + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + return -ENOMEM; + + token->kad = kzalloc(plen, GFP_KERNEL); + if (!token->kad) { + kfree(token); + return -ENOMEM; + } + + token->security_index = RXRPC_SECURITY_RXKAD; + token->kad->ticket_len = tktlen; + token->kad->vice_id = ntohl(xdr[0]); + token->kad->kvno = ntohl(xdr[1]); + token->kad->start = ntohl(xdr[4]); + token->kad->expiry = ntohl(xdr[5]); + token->kad->primary_flag = ntohl(xdr[6]); + memcpy(&token->kad->session_key, &xdr[2], 8); + memcpy(&token->kad->ticket, &xdr[8], tktlen); + + _debug("SCIX: %u", token->security_index); + _debug("TLEN: %u", token->kad->ticket_len); + _debug("EXPY: %x", token->kad->expiry); + _debug("KVNO: %u", token->kad->kvno); + _debug("PRIM: %u", token->kad->primary_flag); + _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", + token->kad->session_key[0], token->kad->session_key[1], + token->kad->session_key[2], token->kad->session_key[3], + token->kad->session_key[4], token->kad->session_key[5], + token->kad->session_key[6], token->kad->session_key[7]); + if (token->kad->ticket_len >= 8) + _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", + token->kad->ticket[0], token->kad->ticket[1], + token->kad->ticket[2], token->kad->ticket[3], + token->kad->ticket[4], token->kad->ticket[5], + token->kad->ticket[6], token->kad->ticket[7]); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + if (token->kad->expiry < prep->expiry) + prep->expiry = token->kad->expiry; + + _leave(" = 0"); + return 0; +} + +static void rxrpc_free_krb5_principal(struct krb5_principal *princ) +{ + int loop; + + if (princ->name_parts) { + for (loop = princ->n_name_parts - 1; loop >= 0; loop--) + kfree(princ->name_parts[loop]); + kfree(princ->name_parts); + } + kfree(princ->realm); +} + +static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td) +{ + kfree(td->data); +} + +/* + * free up an RxK5 token + */ +static void rxrpc_rxk5_free(struct rxk5_key *rxk5) +{ + int loop; + + rxrpc_free_krb5_principal(&rxk5->client); + rxrpc_free_krb5_principal(&rxk5->server); + rxrpc_free_krb5_tagged(&rxk5->session); + + if (rxk5->addresses) { + for (loop = rxk5->n_addresses - 1; loop >= 0; loop--) + rxrpc_free_krb5_tagged(&rxk5->addresses[loop]); + kfree(rxk5->addresses); + } + if (rxk5->authdata) { + for (loop = rxk5->n_authdata - 1; loop >= 0; loop--) + rxrpc_free_krb5_tagged(&rxk5->authdata[loop]); + kfree(rxk5->authdata); + } + + kfree(rxk5->ticket); + kfree(rxk5->ticket2); + kfree(rxk5); +} + +/* + * extract a krb5 principal + */ +static int rxrpc_krb5_decode_principal(struct krb5_principal *princ, + const __be32 **_xdr, + unsigned int *_toklen) +{ + const __be32 *xdr = *_xdr; + unsigned int toklen = *_toklen, n_parts, loop, tmp; + + /* there must be at least one name, and at least #names+1 length + * words */ + if (toklen <= 12) + return -EINVAL; + + _enter(",{%x,%x,%x},%u", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen); + + n_parts = ntohl(*xdr++); + toklen -= 4; + if (n_parts <= 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX) + return -EINVAL; + princ->n_name_parts = n_parts; + + if (toklen <= (n_parts + 1) * 4) + return -EINVAL; + + princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL); + if (!princ->name_parts) + return -ENOMEM; + + for (loop = 0; loop < n_parts; loop++) { + if (toklen < 4) + return -EINVAL; + tmp = ntohl(*xdr++); + toklen -= 4; + if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX) + return -EINVAL; + if (tmp > toklen) + return -EINVAL; + princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL); + if (!princ->name_parts[loop]) + return -ENOMEM; + memcpy(princ->name_parts[loop], xdr, tmp); + princ->name_parts[loop][tmp] = 0; + tmp = (tmp + 3) & ~3; + toklen -= tmp; + xdr += tmp >> 2; + } + + if (toklen < 4) + return -EINVAL; + tmp = ntohl(*xdr++); + toklen -= 4; + if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX) + return -EINVAL; + if (tmp > toklen) + return -EINVAL; + princ->realm = kmalloc(tmp + 1, GFP_KERNEL); + if (!princ->realm) + return -ENOMEM; + memcpy(princ->realm, xdr, tmp); + princ->realm[tmp] = 0; + tmp = (tmp + 3) & ~3; + toklen -= tmp; + xdr += tmp >> 2; + + _debug("%s/...@%s", princ->name_parts[0], princ->realm); + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * extract a piece of krb5 tagged data + */ +static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td, + size_t max_data_size, + const __be32 **_xdr, + unsigned int *_toklen) +{ + const __be32 *xdr = *_xdr; + unsigned int toklen = *_toklen, len; + + /* there must be at least one tag and one length word */ + if (toklen <= 8) + return -EINVAL; + + _enter(",%zu,{%x,%x},%u", + max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen); + + td->tag = ntohl(*xdr++); + len = ntohl(*xdr++); + toklen -= 8; + if (len > max_data_size) + return -EINVAL; + td->data_len = len; + + if (len > 0) { + td->data = kmemdup(xdr, len, GFP_KERNEL); + if (!td->data) + return -ENOMEM; + len = (len + 3) & ~3; + toklen -= len; + xdr += len >> 2; + } + + _debug("tag %x len %x", td->tag, td->data_len); + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * extract an array of tagged data + */ +static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, + u8 *_n_elem, + u8 max_n_elem, + size_t max_elem_size, + const __be32 **_xdr, + unsigned int *_toklen) +{ + struct krb5_tagged_data *td; + const __be32 *xdr = *_xdr; + unsigned int toklen = *_toklen, n_elem, loop; + int ret; + + /* there must be at least one count */ + if (toklen < 4) + return -EINVAL; + + _enter(",,%u,%zu,{%x},%u", + max_n_elem, max_elem_size, ntohl(xdr[0]), toklen); + + n_elem = ntohl(*xdr++); + toklen -= 4; + if (n_elem > max_n_elem) + return -EINVAL; + *_n_elem = n_elem; + if (n_elem > 0) { + if (toklen <= (n_elem + 1) * 4) + return -EINVAL; + + _debug("n_elem %d", n_elem); + + td = kcalloc(n_elem, sizeof(struct krb5_tagged_data), + GFP_KERNEL); + if (!td) + return -ENOMEM; + *_td = td; + + for (loop = 0; loop < n_elem; loop++) { + ret = rxrpc_krb5_decode_tagged_data(&td[loop], + max_elem_size, + &xdr, &toklen); + if (ret < 0) + return ret; + } + } + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * extract a krb5 ticket + */ +static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen, + const __be32 **_xdr, unsigned int *_toklen) +{ + const __be32 *xdr = *_xdr; + unsigned int toklen = *_toklen, len; + + /* there must be at least one length word */ + if (toklen <= 4) + return -EINVAL; + + _enter(",{%x},%u", ntohl(xdr[0]), toklen); + + len = ntohl(*xdr++); + toklen -= 4; + if (len > AFSTOKEN_K5_TIX_MAX) + return -EINVAL; + *_tktlen = len; + + _debug("ticket len %u", len); + + if (len > 0) { + *_ticket = kmemdup(xdr, len, GFP_KERNEL); + if (!*_ticket) + return -ENOMEM; + len = (len + 3) & ~3; + toklen -= len; + xdr += len >> 2; + } + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * parse an RxK5 type XDR format token + * - the caller guarantees we have at least 4 words + */ +static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + struct rxk5_key *rxk5; + const __be32 *end_xdr = xdr + (toklen >> 2); + int ret; + + _enter(",{%x,%x,%x,%x},%u", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + /* reserve some payload space for this subkey - the length of the token + * is a reasonable approximation */ + prep->quotalen = datalen + toklen; + + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + return -ENOMEM; + + rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL); + if (!rxk5) { + kfree(token); + return -ENOMEM; + } + + token->security_index = RXRPC_SECURITY_RXK5; + token->k5 = rxk5; + + /* extract the principals */ + ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen); + if (ret < 0) + goto error; + ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen); + if (ret < 0) + goto error; + + /* extract the session key and the encoding type (the tag field -> + * ENCTYPE_xxx) */ + ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX, + &xdr, &toklen); + if (ret < 0) + goto error; + + if (toklen < 4 * 8 + 2 * 4) + goto inval; + rxk5->authtime = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->starttime = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->endtime = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->renew_till = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->is_skey = ntohl(*xdr++); + rxk5->flags = ntohl(*xdr++); + toklen -= 4 * 8 + 2 * 4; + + _debug("times: a=%llx s=%llx e=%llx rt=%llx", + rxk5->authtime, rxk5->starttime, rxk5->endtime, + rxk5->renew_till); + _debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags); + + /* extract the permitted client addresses */ + ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses, + &rxk5->n_addresses, + AFSTOKEN_K5_ADDRESSES_MAX, + AFSTOKEN_DATA_MAX, + &xdr, &toklen); + if (ret < 0) + goto error; + + ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); + + /* extract the tickets */ + ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len, + &xdr, &toklen); + if (ret < 0) + goto error; + ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len, + &xdr, &toklen); + if (ret < 0) + goto error; + + ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); + + /* extract the typed auth data */ + ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata, + &rxk5->n_authdata, + AFSTOKEN_K5_AUTHDATA_MAX, + AFSTOKEN_BDATALN_MAX, + &xdr, &toklen); + if (ret < 0) + goto error; + + ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); + + if (toklen != 0) + goto inval; + + /* attach the payload */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + if (token->kad->expiry < prep->expiry) + prep->expiry = token->kad->expiry; + + _leave(" = 0"); + return 0; + +inval: + ret = -EINVAL; +error: + rxrpc_rxk5_free(rxk5); + kfree(token); + _leave(" = %d", ret); + return ret; +} + +/* + * attempt to parse the data as the XDR format + * - the caller guarantees we have more than 7 words + */ +static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) +{ + const __be32 *xdr = prep->data, *token; + const char *cp; + unsigned int len, tmp, loop, ntoken, toklen, sec_ix; + size_t datalen = prep->datalen; + int ret; + + _enter(",{%x,%x,%x,%x},%zu", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + prep->datalen); + + if (datalen > AFSTOKEN_LENGTH_MAX) + goto not_xdr; + + /* XDR is an array of __be32's */ + if (datalen & 3) + goto not_xdr; + + /* the flags should be 0 (the setpag bit must be handled by + * userspace) */ + if (ntohl(*xdr++) != 0) + goto not_xdr; + datalen -= 4; + + /* check the cell name */ + len = ntohl(*xdr++); + if (len < 1 || len > AFSTOKEN_CELL_MAX) + goto not_xdr; + datalen -= 4; + tmp = (len + 3) & ~3; + if (tmp > datalen) + goto not_xdr; + + cp = (const char *) xdr; + for (loop = 0; loop < len; loop++) + if (!isprint(cp[loop])) + goto not_xdr; + if (len < tmp) + for (; loop < tmp; loop++) + if (cp[loop]) + goto not_xdr; + _debug("cellname: [%u/%u] '%*.*s'", + len, tmp, len, len, (const char *) xdr); + datalen -= tmp; + xdr += tmp >> 2; + + /* get the token count */ + if (datalen < 12) + goto not_xdr; + ntoken = ntohl(*xdr++); + datalen -= 4; + _debug("ntoken: %x", ntoken); + if (ntoken < 1 || ntoken > AFSTOKEN_MAX) + goto not_xdr; + + /* check each token wrapper */ + token = xdr; + loop = ntoken; + do { + if (datalen < 8) + goto not_xdr; + toklen = ntohl(*xdr++); + sec_ix = ntohl(*xdr); + datalen -= 4; + _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix); + if (toklen < 20 || toklen > datalen) + goto not_xdr; + datalen -= (toklen + 3) & ~3; + xdr += (toklen + 3) >> 2; + + } while (--loop > 0); + + _debug("remainder: %zu", datalen); + if (datalen != 0) + goto not_xdr; + + /* okay: we're going to assume it's valid XDR format + * - we ignore the cellname, relying on the key to be correctly named + */ + do { + xdr = token; + toklen = ntohl(*xdr++); + token = xdr + ((toklen + 3) >> 2); + sec_ix = ntohl(*xdr++); + toklen -= 4; + + _debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token); + + switch (sec_ix) { + case RXRPC_SECURITY_RXKAD: + ret = rxrpc_preparse_xdr_rxkad(prep, datalen, xdr, toklen); + if (ret != 0) + goto error; + break; + + case RXRPC_SECURITY_RXK5: + ret = rxrpc_preparse_xdr_rxk5(prep, datalen, xdr, toklen); + if (ret != 0) + goto error; + break; + + default: + ret = -EPROTONOSUPPORT; + goto error; + } + + } while (--ntoken > 0); + + _leave(" = 0"); + return 0; + +not_xdr: + _leave(" = -EPROTO"); + return -EPROTO; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * Preparse an rxrpc defined key. + * + * Data should be of the form: + * OFFSET LEN CONTENT + * 0 4 key interface version number + * 4 2 security index (type) + * 6 2 ticket length + * 8 4 key expiry time (time_t) + * 12 4 kvno + * 16 8 session key + * 24 [len] ticket + * + * if no data is provided, then a no-security key is made + */ +static int rxrpc_preparse(struct key_preparsed_payload *prep) +{ + const struct rxrpc_key_data_v1 *v1; + struct rxrpc_key_token *token, **pp; + size_t plen; + u32 kver; + int ret; + + _enter("%zu", prep->datalen); + + /* handle a no-security key */ + if (!prep->data && prep->datalen == 0) + return 0; + + /* determine if the XDR payload format is being used */ + if (prep->datalen > 7 * 4) { + ret = rxrpc_preparse_xdr(prep); + if (ret != -EPROTO) + return ret; + } + + /* get the key interface version number */ + ret = -EINVAL; + if (prep->datalen <= 4 || !prep->data) + goto error; + memcpy(&kver, prep->data, sizeof(kver)); + prep->data += sizeof(kver); + prep->datalen -= sizeof(kver); + + _debug("KEY I/F VERSION: %u", kver); + + ret = -EKEYREJECTED; + if (kver != 1) + goto error; + + /* deal with a version 1 key */ + ret = -EINVAL; + if (prep->datalen < sizeof(*v1)) + goto error; + + v1 = prep->data; + if (prep->datalen != sizeof(*v1) + v1->ticket_length) + goto error; + + _debug("SCIX: %u", v1->security_index); + _debug("TLEN: %u", v1->ticket_length); + _debug("EXPY: %x", v1->expiry); + _debug("KVNO: %u", v1->kvno); + _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", + v1->session_key[0], v1->session_key[1], + v1->session_key[2], v1->session_key[3], + v1->session_key[4], v1->session_key[5], + v1->session_key[6], v1->session_key[7]); + if (v1->ticket_length >= 8) + _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", + v1->ticket[0], v1->ticket[1], + v1->ticket[2], v1->ticket[3], + v1->ticket[4], v1->ticket[5], + v1->ticket[6], v1->ticket[7]); + + ret = -EPROTONOSUPPORT; + if (v1->security_index != RXRPC_SECURITY_RXKAD) + goto error; + + plen = sizeof(*token->kad) + v1->ticket_length; + prep->quotalen = plen + sizeof(*token); + + ret = -ENOMEM; + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + goto error; + token->kad = kzalloc(plen, GFP_KERNEL); + if (!token->kad) + goto error_free; + + token->security_index = RXRPC_SECURITY_RXKAD; + token->kad->ticket_len = v1->ticket_length; + token->kad->expiry = v1->expiry; + token->kad->kvno = v1->kvno; + memcpy(&token->kad->session_key, &v1->session_key, 8); + memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + pp = (struct rxrpc_key_token **)&prep->payload.data[0]; + while (*pp) + pp = &(*pp)->next; + *pp = token; + if (token->kad->expiry < prep->expiry) + prep->expiry = token->kad->expiry; + token = NULL; + ret = 0; + +error_free: + kfree(token); +error: + return ret; +} + +/* + * Free token list. + */ +static void rxrpc_free_token_list(struct rxrpc_key_token *token) +{ + struct rxrpc_key_token *next; + + for (; token; token = next) { + next = token->next; + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + kfree(token->kad); + break; + case RXRPC_SECURITY_RXK5: + if (token->k5) + rxrpc_rxk5_free(token->k5); + break; + default: + pr_err("Unknown token type %x on rxrpc key\n", + token->security_index); + BUG(); + } + + kfree(token); + } +} + +/* + * Clean up preparse data. + */ +static void rxrpc_free_preparse(struct key_preparsed_payload *prep) +{ + rxrpc_free_token_list(prep->payload.data[0]); +} + +/* + * Preparse a server secret key. + * + * The data should be the 8-byte secret key. + */ +static int rxrpc_preparse_s(struct key_preparsed_payload *prep) +{ + struct crypto_skcipher *ci; + + _enter("%zu", prep->datalen); + + if (prep->datalen != 8) + return -EINVAL; + + memcpy(&prep->payload.data[2], prep->data, 8); + + ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ci)) { + _leave(" = %ld", PTR_ERR(ci)); + return PTR_ERR(ci); + } + + if (crypto_skcipher_setkey(ci, prep->data, 8) < 0) + BUG(); + + prep->payload.data[0] = ci; + _leave(" = 0"); + return 0; +} + +/* + * Clean up preparse data. + */ +static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) +{ + if (prep->payload.data[0]) + crypto_free_skcipher(prep->payload.data[0]); +} + +/* + * dispose of the data dangling from the corpse of a rxrpc key + */ +static void rxrpc_destroy(struct key *key) +{ + rxrpc_free_token_list(key->payload.data[0]); +} + +/* + * dispose of the data dangling from the corpse of a rxrpc key + */ +static void rxrpc_destroy_s(struct key *key) +{ + if (key->payload.data[0]) { + crypto_free_skcipher(key->payload.data[0]); + key->payload.data[0] = NULL; + } +} + +/* + * describe the rxrpc key + */ +static void rxrpc_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); +} + +/* + * grab the security key for a socket + */ +int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = memdup_user_nul(optval, optlen); + if (IS_ERR(description)) + return PTR_ERR(description); + + key = request_key(&key_type_rxrpc, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->key = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} + +/* + * grab the security keyring for a server socket + */ +int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, + int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = memdup_user_nul(optval, optlen); + if (IS_ERR(description)) + return PTR_ERR(description); + + key = request_key(&key_type_keyring, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->securities = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} + +/* + * generate a server data key + */ +int rxrpc_get_server_data_key(struct rxrpc_connection *conn, + const void *session_key, + time_t expiry, + u32 kvno) +{ + const struct cred *cred = current_cred(); + struct key *key; + int ret; + + struct { + u32 kver; + struct rxrpc_key_data_v1 v1; + } data; + + _enter(""); + + key = key_alloc(&key_type_rxrpc, "x", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + return -ENOMEM; + } + + _debug("key %d", key_serial(key)); + + data.kver = 1; + data.v1.security_index = RXRPC_SECURITY_RXKAD; + data.v1.ticket_length = 0; + data.v1.expiry = expiry; + data.v1.kvno = 0; + + memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key)); + + ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL); + if (ret < 0) + goto error; + + conn->params.key = key; + _leave(" = 0 [%d]", key_serial(key)); + return 0; + +error: + key_revoke(key); + key_put(key); + _leave(" = -ENOMEM [ins %d]", ret); + return -ENOMEM; +} +EXPORT_SYMBOL(rxrpc_get_server_data_key); + +/** + * rxrpc_get_null_key - Generate a null RxRPC key + * @keyname: The name to give the key. + * + * Generate a null RxRPC key that can be used to indicate anonymous security is + * required for a particular domain. + */ +struct key *rxrpc_get_null_key(const char *keyname) +{ + const struct cred *cred = current_cred(); + struct key *key; + int ret; + + key = key_alloc(&key_type_rxrpc, keyname, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) + return key; + + ret = key_instantiate_and_link(key, NULL, 0, NULL, NULL); + if (ret < 0) { + key_revoke(key); + key_put(key); + return ERR_PTR(ret); + } + + return key; +} +EXPORT_SYMBOL(rxrpc_get_null_key); + +/* + * read the contents of an rxrpc key + * - this returns the result in XDR form + */ +static long rxrpc_read(const struct key *key, + char __user *buffer, size_t buflen) +{ + const struct rxrpc_key_token *token; + const struct krb5_principal *princ; + size_t size; + __be32 __user *xdr, *oldxdr; + u32 cnlen, toksize, ntoks, tok, zero; + u16 toksizes[AFSTOKEN_MAX]; + int loop; + + _enter(""); + + /* we don't know what form we should return non-AFS keys in */ + if (memcmp(key->description, "afs@", 4) != 0) + return -EOPNOTSUPP; + cnlen = strlen(key->description + 4); + +#define RND(X) (((X) + 3) & ~3) + + /* AFS keys we return in XDR form, so we need to work out the size of + * the XDR */ + size = 2 * 4; /* flags, cellname len */ + size += RND(cnlen); /* cellname */ + size += 1 * 4; /* token count */ + + ntoks = 0; + for (token = key->payload.data[0]; token; token = token->next) { + toksize = 4; /* sec index */ + + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + toksize += 8 * 4; /* viceid, kvno, key*2, begin, + * end, primary, tktlen */ + toksize += RND(token->kad->ticket_len); + break; + + case RXRPC_SECURITY_RXK5: + princ = &token->k5->client; + toksize += 4 + princ->n_name_parts * 4; + for (loop = 0; loop < princ->n_name_parts; loop++) + toksize += RND(strlen(princ->name_parts[loop])); + toksize += 4 + RND(strlen(princ->realm)); + + princ = &token->k5->server; + toksize += 4 + princ->n_name_parts * 4; + for (loop = 0; loop < princ->n_name_parts; loop++) + toksize += RND(strlen(princ->name_parts[loop])); + toksize += 4 + RND(strlen(princ->realm)); + + toksize += 8 + RND(token->k5->session.data_len); + + toksize += 4 * 8 + 2 * 4; + + toksize += 4 + token->k5->n_addresses * 8; + for (loop = 0; loop < token->k5->n_addresses; loop++) + toksize += RND(token->k5->addresses[loop].data_len); + + toksize += 4 + RND(token->k5->ticket_len); + toksize += 4 + RND(token->k5->ticket2_len); + + toksize += 4 + token->k5->n_authdata * 8; + for (loop = 0; loop < token->k5->n_authdata; loop++) + toksize += RND(token->k5->authdata[loop].data_len); + break; + + default: /* we have a ticket we can't encode */ + BUG(); + continue; + } + + _debug("token[%u]: toksize=%u", ntoks, toksize); + ASSERTCMP(toksize, <=, AFSTOKEN_LENGTH_MAX); + + toksizes[ntoks++] = toksize; + size += toksize + 4; /* each token has a length word */ + } + +#undef RND + + if (!buffer || buflen < size) + return size; + + xdr = (__be32 __user *) buffer; + zero = 0; +#define ENCODE(x) \ + do { \ + __be32 y = htonl(x); \ + if (put_user(y, xdr++) < 0) \ + goto fault; \ + } while(0) +#define ENCODE_DATA(l, s) \ + do { \ + u32 _l = (l); \ + ENCODE(l); \ + if (copy_to_user(xdr, (s), _l) != 0) \ + goto fault; \ + if (_l & 3 && \ + copy_to_user((u8 __user *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \ + goto fault; \ + xdr += (_l + 3) >> 2; \ + } while(0) +#define ENCODE64(x) \ + do { \ + __be64 y = cpu_to_be64(x); \ + if (copy_to_user(xdr, &y, 8) != 0) \ + goto fault; \ + xdr += 8 >> 2; \ + } while(0) +#define ENCODE_STR(s) \ + do { \ + const char *_s = (s); \ + ENCODE_DATA(strlen(_s), _s); \ + } while(0) + + ENCODE(0); /* flags */ + ENCODE_DATA(cnlen, key->description + 4); /* cellname */ + ENCODE(ntoks); + + tok = 0; + for (token = key->payload.data[0]; token; token = token->next) { + toksize = toksizes[tok++]; + ENCODE(toksize); + oldxdr = xdr; + ENCODE(token->security_index); + + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + ENCODE(token->kad->vice_id); + ENCODE(token->kad->kvno); + ENCODE_DATA(8, token->kad->session_key); + ENCODE(token->kad->start); + ENCODE(token->kad->expiry); + ENCODE(token->kad->primary_flag); + ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); + break; + + case RXRPC_SECURITY_RXK5: + princ = &token->k5->client; + ENCODE(princ->n_name_parts); + for (loop = 0; loop < princ->n_name_parts; loop++) + ENCODE_STR(princ->name_parts[loop]); + ENCODE_STR(princ->realm); + + princ = &token->k5->server; + ENCODE(princ->n_name_parts); + for (loop = 0; loop < princ->n_name_parts; loop++) + ENCODE_STR(princ->name_parts[loop]); + ENCODE_STR(princ->realm); + + ENCODE(token->k5->session.tag); + ENCODE_DATA(token->k5->session.data_len, + token->k5->session.data); + + ENCODE64(token->k5->authtime); + ENCODE64(token->k5->starttime); + ENCODE64(token->k5->endtime); + ENCODE64(token->k5->renew_till); + ENCODE(token->k5->is_skey); + ENCODE(token->k5->flags); + + ENCODE(token->k5->n_addresses); + for (loop = 0; loop < token->k5->n_addresses; loop++) { + ENCODE(token->k5->addresses[loop].tag); + ENCODE_DATA(token->k5->addresses[loop].data_len, + token->k5->addresses[loop].data); + } + + ENCODE_DATA(token->k5->ticket_len, token->k5->ticket); + ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2); + + ENCODE(token->k5->n_authdata); + for (loop = 0; loop < token->k5->n_authdata; loop++) { + ENCODE(token->k5->authdata[loop].tag); + ENCODE_DATA(token->k5->authdata[loop].data_len, + token->k5->authdata[loop].data); + } + break; + + default: + BUG(); + break; + } + + ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==, + toksize); + } + +#undef ENCODE_STR +#undef ENCODE_DATA +#undef ENCODE64 +#undef ENCODE + + ASSERTCMP(tok, ==, ntoks); + ASSERTCMP((char __user *) xdr - buffer, ==, size); + _leave(" = %zu", size); + return size; + +fault: + _leave(" = -EFAULT"); + return -EFAULT; +} diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c new file mode 100644 index 000000000..31a3f86ef --- /dev/null +++ b/net/rxrpc/local_event.c @@ -0,0 +1,116 @@ +/* AF_RXRPC local endpoint management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; + +/* + * Reply to a version request + */ +static void rxrpc_send_version_request(struct rxrpc_local *local, + struct rxrpc_host_header *hdr, + struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct sockaddr_in sin; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter(""); + + sin.sin_family = AF_INET; + sin.sin_port = udp_hdr(skb)->source; + sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + + msg.msg_name = &sin; + msg.msg_namelen = sizeof(sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = 0; + whdr.serial = 0; + whdr.type = RXRPC_PACKET_TYPE_VERSION; + whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); + whdr.userStatus = 0; + whdr.securityIndex = 0; + whdr._rsvd = 0; + whdr.serviceId = htons(sp->hdr.serviceId); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = (char *)rxrpc_version_string; + iov[1].iov_len = sizeof(rxrpc_version_string); + + len = iov[0].iov_len + iov[1].iov_len; + + _proto("Tx VERSION (reply)"); + + ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); + if (ret < 0) + _debug("sendmsg failed: %d", ret); + + _leave(""); +} + +/* + * Process event packets targetted at a local endpoint. + */ +void rxrpc_process_local_events(struct rxrpc_local *local) +{ + struct sk_buff *skb; + char v; + + _enter(""); + + skb = skb_dequeue(&local->event_queue); + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + _debug("{%d},{%u}", local->debug_id, sp->hdr.type); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: + if (skb_copy_bits(skb, 0, &v, 1) < 0) + return; + _proto("Rx VERSION { %02x }", v); + if (v == 0) + rxrpc_send_version_request(local, &sp->hdr, skb); + break; + + default: + /* Just ignore anything we don't understand */ + break; + } + + rxrpc_free_skb(skb); + } + + _leave(""); +} diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c new file mode 100644 index 000000000..a753796fb --- /dev/null +++ b/net/rxrpc/local_object.c @@ -0,0 +1,390 @@ +/* Local endpoint object management + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static void rxrpc_local_processor(struct work_struct *); +static void rxrpc_local_rcu(struct rcu_head *); + +static DEFINE_MUTEX(rxrpc_local_mutex); +static LIST_HEAD(rxrpc_local_endpoints); + +/* + * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, + * same or greater than. + * + * We explicitly don't compare the RxRPC service ID as we want to reject + * conflicting uses by differing services. Further, we don't want to share + * addresses with different options (IPv6), so we don't compare those bits + * either. + */ +static long rxrpc_local_cmp_key(const struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx) +{ + long diff; + + diff = ((local->srx.transport_type - srx->transport_type) ?: + (local->srx.transport_len - srx->transport_len) ?: + (local->srx.transport.family - srx->transport.family)); + if (diff != 0) + return diff; + + switch (srx->transport.family) { + case AF_INET: + /* If the choice of UDP port is left up to the transport, then + * the endpoint record doesn't match. + */ + return ((u16 __force)local->srx.transport.sin.sin_port - + (u16 __force)srx->transport.sin.sin_port) ?: + memcmp(&local->srx.transport.sin.sin_addr, + &srx->transport.sin.sin_addr, + sizeof(struct in_addr)); + default: + BUG(); + } +} + +/* + * Allocate a new local endpoint. + */ +static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) +{ + struct rxrpc_local *local; + + local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); + if (local) { + atomic_set(&local->usage, 1); + INIT_LIST_HEAD(&local->link); + INIT_WORK(&local->processor, rxrpc_local_processor); + INIT_LIST_HEAD(&local->services); + init_rwsem(&local->defrag_sem); + skb_queue_head_init(&local->accept_queue); + skb_queue_head_init(&local->reject_queue); + skb_queue_head_init(&local->event_queue); + local->client_conns = RB_ROOT; + spin_lock_init(&local->client_conns_lock); + spin_lock_init(&local->lock); + rwlock_init(&local->services_lock); + local->debug_id = atomic_inc_return(&rxrpc_debug_id); + memcpy(&local->srx, srx, sizeof(*srx)); + } + + _leave(" = %p", local); + return local; +} + +/* + * create the local socket + * - must be called with rxrpc_local_mutex locked + */ +static int rxrpc_open_socket(struct rxrpc_local *local) +{ + struct sock *sock; + int ret, opt; + + _enter("%p{%d}", local, local->srx.transport_type); + + /* create a socket to represent the local endpoint */ + ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type, + IPPROTO_UDP, &local->socket); + if (ret < 0) { + _leave(" = %d [socket]", ret); + return ret; + } + + /* if a local address was supplied then bind it */ + if (local->srx.transport_len > sizeof(sa_family_t)) { + _debug("bind"); + ret = kernel_bind(local->socket, + (struct sockaddr *)&local->srx.transport, + local->srx.transport_len); + if (ret < 0) { + _debug("bind failed %d", ret); + goto error; + } + } + + /* we want to receive ICMP errors */ + opt = 1; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + /* we want to set the don't fragment bit */ + opt = IP_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + /* set the socket up */ + sock = local->socket->sk; + sock->sk_user_data = local; + sock->sk_data_ready = rxrpc_data_ready; + sock->sk_error_report = rxrpc_error_report; + _leave(" = 0"); + return 0; + +error: + kernel_sock_shutdown(local->socket, SHUT_RDWR); + local->socket->sk->sk_user_data = NULL; + sock_release(local->socket); + local->socket = NULL; + + _leave(" = %d", ret); + return ret; +} + +/* + * Look up or create a new local endpoint using the specified local address. + */ +struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx) +{ + struct rxrpc_local *local; + struct list_head *cursor; + const char *age; + long diff; + int ret; + + if (srx->transport.family == AF_INET) { + _enter("{%d,%u,%pI4+%hu}", + srx->transport_type, + srx->transport.family, + &srx->transport.sin.sin_addr, + ntohs(srx->transport.sin.sin_port)); + } else { + _enter("{%d,%u}", + srx->transport_type, + srx->transport.family); + return ERR_PTR(-EAFNOSUPPORT); + } + + mutex_lock(&rxrpc_local_mutex); + + for (cursor = rxrpc_local_endpoints.next; + cursor != &rxrpc_local_endpoints; + cursor = cursor->next) { + local = list_entry(cursor, struct rxrpc_local, link); + + diff = rxrpc_local_cmp_key(local, srx); + if (diff < 0) + continue; + if (diff > 0) + break; + + /* Services aren't allowed to share transport sockets, so + * reject that here. It is possible that the object is dying - + * but it may also still have the local transport address that + * we want bound. + */ + if (srx->srx_service) { + local = NULL; + goto addr_in_use; + } + + /* Found a match. We replace a dying object. Attempting to + * bind the transport socket may still fail if we're attempting + * to use a local address that the dying object is still using. + */ + if (!rxrpc_get_local_maybe(local)) { + cursor = cursor->next; + list_del_init(&local->link); + break; + } + + age = "old"; + goto found; + } + + local = rxrpc_alloc_local(srx); + if (!local) + goto nomem; + + ret = rxrpc_open_socket(local); + if (ret < 0) + goto sock_error; + + list_add_tail(&local->link, cursor); + age = "new"; + +found: + mutex_unlock(&rxrpc_local_mutex); + + _net("LOCAL %s %d {%d,%u,%pI4+%hu}", + age, + local->debug_id, + local->srx.transport_type, + local->srx.transport.family, + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port)); + + _leave(" = %p", local); + return local; + +nomem: + ret = -ENOMEM; +sock_error: + mutex_unlock(&rxrpc_local_mutex); + kfree(local); + _leave(" = %d", ret); + return ERR_PTR(ret); + +addr_in_use: + mutex_unlock(&rxrpc_local_mutex); + _leave(" = -EADDRINUSE"); + return ERR_PTR(-EADDRINUSE); +} + +/* + * A local endpoint reached its end of life. + */ +void __rxrpc_put_local(struct rxrpc_local *local) +{ + _enter("%d", local->debug_id); + rxrpc_queue_work(&local->processor); +} + +/* + * Destroy a local endpoint's socket and then hand the record to RCU to dispose + * of. + * + * Closing the socket cannot be done from bottom half context or RCU callback + * context because it might sleep. + */ +static void rxrpc_local_destroyer(struct rxrpc_local *local) +{ + struct socket *socket = local->socket; + + _enter("%d", local->debug_id); + + /* We can get a race between an incoming call packet queueing the + * processor again and the work processor starting the destruction + * process which will shut down the UDP socket. + */ + if (local->dead) { + _leave(" [already dead]"); + return; + } + local->dead = true; + + mutex_lock(&rxrpc_local_mutex); + list_del_init(&local->link); + mutex_unlock(&rxrpc_local_mutex); + + ASSERT(RB_EMPTY_ROOT(&local->client_conns)); + ASSERT(list_empty(&local->services)); + + if (socket) { + local->socket = NULL; + kernel_sock_shutdown(socket, SHUT_RDWR); + socket->sk->sk_user_data = NULL; + sock_release(socket); + } + + /* At this point, there should be no more packets coming in to the + * local endpoint. + */ + rxrpc_purge_queue(&local->accept_queue); + rxrpc_purge_queue(&local->reject_queue); + rxrpc_purge_queue(&local->event_queue); + + _debug("rcu local %d", local->debug_id); + call_rcu(&local->rcu, rxrpc_local_rcu); +} + +/* + * Process events on an endpoint + */ +static void rxrpc_local_processor(struct work_struct *work) +{ + struct rxrpc_local *local = + container_of(work, struct rxrpc_local, processor); + bool again; + + _enter("%d", local->debug_id); + + do { + again = false; + if (atomic_read(&local->usage) == 0) + return rxrpc_local_destroyer(local); + + if (!skb_queue_empty(&local->accept_queue)) { + rxrpc_accept_incoming_calls(local); + again = true; + } + + if (!skb_queue_empty(&local->reject_queue)) { + rxrpc_reject_packets(local); + again = true; + } + + if (!skb_queue_empty(&local->event_queue)) { + rxrpc_process_local_events(local); + again = true; + } + } while (again); +} + +/* + * Destroy a local endpoint after the RCU grace period expires. + */ +static void rxrpc_local_rcu(struct rcu_head *rcu) +{ + struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu); + + _enter("%d", local->debug_id); + + ASSERT(!work_pending(&local->processor)); + + _net("DESTROY LOCAL %d", local->debug_id); + kfree(local); + _leave(""); +} + +/* + * Verify the local endpoint list is empty by this point. + */ +void __exit rxrpc_destroy_all_locals(void) +{ + struct rxrpc_local *local; + + _enter(""); + + flush_workqueue(rxrpc_workqueue); + + if (!list_empty(&rxrpc_local_endpoints)) { + mutex_lock(&rxrpc_local_mutex); + list_for_each_entry(local, &rxrpc_local_endpoints, link) { + pr_err("AF_RXRPC: Leaked local %p {%d}\n", + local, atomic_read(&local->usage)); + } + mutex_unlock(&rxrpc_local_mutex); + BUG(); + } + + rcu_barrier(); +} diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 1afe9876e..bdc5e42fe 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -14,6 +14,12 @@ #include #include "ar-internal.h" +/* + * The maximum listening backlog queue size that may be set on a socket by + * listen(). + */ +unsigned int rxrpc_max_backlog __read_mostly = 10; + /* * How long to wait before scheduling ACK generation after seeing a * packet with RXRPC_REQUEST_ACK set (in jiffies). diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c new file mode 100644 index 000000000..f4bda06b7 --- /dev/null +++ b/net/rxrpc/output.c @@ -0,0 +1,721 @@ +/* RxRPC packet transmission + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * Time till packet resend (in jiffies). + */ +unsigned int rxrpc_resend_timeout = 4 * HZ; + +static int rxrpc_send_data(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, size_t len); + +/* + * extract control messages from the sendmsg() control buffer + */ +static int rxrpc_sendmsg_cmsg(struct msghdr *msg, + unsigned long *user_call_ID, + enum rxrpc_command *command, + u32 *abort_code, + bool *_exclusive) +{ + struct cmsghdr *cmsg; + bool got_user_ID = false; + int len; + + *command = RXRPC_CMD_SEND_DATA; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_USER_CALL_ID: + if (msg->msg_flags & MSG_CMSG_COMPAT) { + if (len != sizeof(u32)) + return -EINVAL; + *user_call_ID = *(u32 *) CMSG_DATA(cmsg); + } else { + if (len != sizeof(unsigned long)) + return -EINVAL; + *user_call_ID = *(unsigned long *) + CMSG_DATA(cmsg); + } + _debug("User Call ID %lx", *user_call_ID); + got_user_ID = true; + break; + + case RXRPC_ABORT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_SEND_ABORT; + if (len != sizeof(*abort_code)) + return -EINVAL; + *abort_code = *(unsigned int *) CMSG_DATA(cmsg); + _debug("Abort %x", *abort_code); + if (*abort_code == 0) + return -EINVAL; + break; + + case RXRPC_ACCEPT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_ACCEPT; + if (len != 0) + return -EINVAL; + break; + + case RXRPC_EXCLUSIVE_CALL: + *_exclusive = true; + if (len != 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + if (!got_user_ID) + return -EINVAL; + _leave(" = 0"); + return 0; +} + +/* + * abort a call, sending an ABORT packet to the peer + */ +static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) +{ + write_lock_bh(&call->state_lock); + + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = abort_code; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + rxrpc_queue_call(call); + } + + write_unlock_bh(&call->state_lock); +} + +/* + * Create a new client call for sendmsg(). + */ +static struct rxrpc_call * +rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, + unsigned long user_call_ID, bool exclusive) +{ + struct rxrpc_conn_parameters cp; + struct rxrpc_call *call; + struct key *key; + + DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); + + _enter(""); + + if (!msg->msg_name) + return ERR_PTR(-EDESTADDRREQ); + + key = rx->key; + if (key && !rx->key->payload.data[0]) + key = NULL; + + memset(&cp, 0, sizeof(cp)); + cp.local = rx->local; + cp.key = rx->key; + cp.security_level = rx->min_sec_level; + cp.exclusive = rx->exclusive | exclusive; + cp.service_id = srx->srx_service; + call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); + + _leave(" = %p\n", call); + return call; +} + +/* + * send a message forming part of a client call through an RxRPC socket + * - caller holds the socket locked + * - the socket may be either a client socket or a server socket + */ +int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) +{ + enum rxrpc_command cmd; + struct rxrpc_call *call; + unsigned long user_call_ID = 0; + bool exclusive = false; + u32 abort_code = 0; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, + &exclusive); + if (ret < 0) + return ret; + + if (cmd == RXRPC_CMD_ACCEPT) { + if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) + return -EINVAL; + call = rxrpc_accept_call(rx, user_call_ID); + if (IS_ERR(call)) + return PTR_ERR(call); + rxrpc_put_call(call); + return 0; + } + + call = rxrpc_find_call_by_user_ID(rx, user_call_ID); + if (!call) { + if (cmd != RXRPC_CMD_SEND_DATA) + return -EBADSLT; + call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, + exclusive); + if (IS_ERR(call)) + return PTR_ERR(call); + } + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + /* it's too late for this call */ + ret = -ECONNRESET; + } else if (cmd == RXRPC_CMD_SEND_ABORT) { + rxrpc_send_abort(call, abort_code); + ret = 0; + } else if (cmd != RXRPC_CMD_SEND_DATA) { + ret = -EINVAL; + } else if (!call->in_clientflag && + call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { + /* request phase complete for this client call */ + ret = -EPROTO; + } else if (call->in_clientflag && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + /* Reply phase not begun or not complete for service call. */ + ret = -EPROTO; + } else { + ret = rxrpc_send_data(rx, call, msg, len); + } + + rxrpc_put_call(call); + _leave(" = %d", ret); + return ret; +} + +/** + * rxrpc_kernel_send_data - Allow a kernel service to send data on a call + * @call: The call to send data through + * @msg: The data to send + * @len: The amount of data to send + * + * Allow a kernel service to send data on a call. The call must be in an state + * appropriate to sending data. No control data should be supplied in @msg, + * nor should an address be supplied. MSG_MORE should be flagged if there's + * more data to come, otherwise this data will end the transmission phase. + */ +int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, + size_t len) +{ + int ret; + + _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); + + ASSERTCMP(msg->msg_name, ==, NULL); + ASSERTCMP(msg->msg_control, ==, NULL); + + lock_sock(&call->socket->sk); + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -ESHUTDOWN; /* it's too late for this call */ + } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + ret = -EPROTO; /* request phase complete for this client call */ + } else { + ret = rxrpc_send_data(call->socket, call, msg, len); + } + + release_sock(&call->socket->sk); + _leave(" = %d", ret); + return ret; +} + +EXPORT_SYMBOL(rxrpc_kernel_send_data); + +/** + * rxrpc_kernel_abort_call - Allow a kernel service to abort a call + * @call: The call to be aborted + * @abort_code: The abort code to stick into the ABORT packet + * + * Allow a kernel service to abort a call, if it's still in an abortable state. + */ +void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code) +{ + _enter("{%d},%d", call->debug_id, abort_code); + + lock_sock(&call->socket->sk); + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_send_abort(call, abort_code); + + release_sock(&call->socket->sk); + _leave(""); +} + +EXPORT_SYMBOL(rxrpc_kernel_abort_call); + +/* + * send a packet through the transport endpoint + */ +int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + struct kvec iov[1]; + struct msghdr msg; + int ret, opt; + + _enter(",{%d}", skb->len); + + iov[0].iov_base = skb->head; + iov[0].iov_len = skb->len; + + msg.msg_name = &conn->params.peer->srx.transport; + msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + /* send the packet with the don't fragment bit set if we currently + * think it's small enough */ + if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) { + down_read(&conn->params.local->defrag_sem); + /* send the packet by UDP + * - returns -EMSGSIZE if UDP would have to fragment the packet + * to go out of the interface + * - in which case, we'll have processed the ICMP error + * message and update the peer record + */ + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, + iov[0].iov_len); + + up_read(&conn->params.local->defrag_sem); + if (ret == -EMSGSIZE) + goto send_fragmentable; + + _leave(" = %d [%u]", ret, conn->params.peer->maxdata); + return ret; + } + +send_fragmentable: + /* attempt to send this message with fragmentation enabled */ + _debug("send fragment"); + + down_write(&conn->params.local->defrag_sem); + + switch (conn->params.local->srx.transport.family) { + case AF_INET: + opt = IP_PMTUDISC_DONT; + ret = kernel_setsockopt(conn->params.local->socket, + SOL_IP, IP_MTU_DISCOVER, + (char *)&opt, sizeof(opt)); + if (ret == 0) { + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, + iov[0].iov_len); + + opt = IP_PMTUDISC_DO; + kernel_setsockopt(conn->params.local->socket, SOL_IP, + IP_MTU_DISCOVER, + (char *)&opt, sizeof(opt)); + } + break; + } + + up_write(&conn->params.local->defrag_sem); + _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata); + return ret; +} + +/* + * wait for space to appear in the transmit/ACK window + * - caller holds the socket locked + */ +static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, + struct rxrpc_call *call, + long *timeo) +{ + DECLARE_WAITQUEUE(myself, current); + int ret; + + _enter(",{%d},%ld", + CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), + call->acks_winsz), + *timeo); + + add_wait_queue(&call->tx_waitq, &myself); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + ret = 0; + if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), + call->acks_winsz) > 0) + break; + if (signal_pending(current)) { + ret = sock_intr_errno(*timeo); + break; + } + + release_sock(&rx->sk); + *timeo = schedule_timeout(*timeo); + lock_sock(&rx->sk); + } + + remove_wait_queue(&call->tx_waitq, &myself); + set_current_state(TASK_RUNNING); + _leave(" = %d", ret); + return ret; +} + +/* + * attempt to schedule an instant Tx resend + */ +static inline void rxrpc_instant_resend(struct rxrpc_call *call) +{ + read_lock_bh(&call->state_lock); + if (try_to_del_timer_sync(&call->resend_timer) >= 0) { + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); +} + +/* + * queue a packet for transmission, set the resend timer and attempt + * to send the packet immediately + */ +static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, + bool last) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + int ret; + + _net("queue skb %p [%d]", skb, call->acks_head); + + ASSERT(call->acks_window != NULL); + call->acks_window[call->acks_head] = (unsigned long) skb; + smp_wmb(); + call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1); + + if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { + _debug("________awaiting reply/ACK__________"); + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + break; + case RXRPC_CALL_SERVER_ACK_REQUEST: + call->state = RXRPC_CALL_SERVER_SEND_REPLY; + if (!last) + break; + case RXRPC_CALL_SERVER_SEND_REPLY: + call->state = RXRPC_CALL_SERVER_AWAIT_ACK; + break; + default: + break; + } + write_unlock_bh(&call->state_lock); + } + + _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); + + sp->need_resend = false; + sp->resend_at = jiffies + rxrpc_resend_timeout; + if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { + _debug("run timer"); + call->resend_timer.expires = sp->resend_at; + add_timer(&call->resend_timer); + } + + /* attempt to cancel the rx-ACK timer, deferring reply transmission if + * we're ACK'ing the request phase of an incoming call */ + ret = -EAGAIN; + if (try_to_del_timer_sync(&call->ack_timer) >= 0) { + /* the packet may be freed by rxrpc_process_call() before this + * returns */ + ret = rxrpc_send_data_packet(call->conn, skb); + _net("sent skb %p", skb); + } else { + _debug("failed to delete ACK timer"); + } + + if (ret < 0) { + _debug("need instant resend %d", ret); + sp->need_resend = true; + rxrpc_instant_resend(call); + } + + _leave(""); +} + +/* + * Convert a host-endian header into a network-endian header. + */ +static void rxrpc_insert_header(struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = htonl(sp->hdr.serial); + whdr.type = sp->hdr.type; + whdr.flags = sp->hdr.flags; + whdr.userStatus = sp->hdr.userStatus; + whdr.securityIndex = sp->hdr.securityIndex; + whdr._rsvd = htons(sp->hdr._rsvd); + whdr.serviceId = htons(sp->hdr.serviceId); + + memcpy(skb->head, &whdr, sizeof(whdr)); +} + +/* + * send data through a socket + * - must be called in process context + * - caller holds the socket locked + */ +static int rxrpc_send_data(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, size_t len) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + struct sock *sk = &rx->sk; + long timeo; + bool more; + int ret, copied; + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + /* this should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + return -EPIPE; + + more = msg->msg_flags & MSG_MORE; + + skb = call->tx_pending; + call->tx_pending = NULL; + + copied = 0; + do { + if (!skb) { + size_t size, chunk, max, space; + + _debug("alloc"); + + if (CIRC_SPACE(call->acks_head, + ACCESS_ONCE(call->acks_tail), + call->acks_winsz) <= 0) { + ret = -EAGAIN; + if (msg->msg_flags & MSG_DONTWAIT) + goto maybe_error; + ret = rxrpc_wait_for_tx_window(rx, call, + &timeo); + if (ret < 0) + goto maybe_error; + } + + max = call->conn->params.peer->maxdata; + max -= call->conn->security_size; + max &= ~(call->conn->size_align - 1UL); + + chunk = max; + if (chunk > msg_data_left(msg) && !more) + chunk = msg_data_left(msg); + + space = chunk + call->conn->size_align; + space &= ~(call->conn->size_align - 1UL); + + size = space + call->conn->header_size; + + _debug("SIZE: %zu/%zu/%zu", chunk, space, size); + + /* create a buffer that we can retain until it's ACK'd */ + skb = sock_alloc_send_skb( + sk, size, msg->msg_flags & MSG_DONTWAIT, &ret); + if (!skb) + goto maybe_error; + + rxrpc_new_skb(skb); + + _debug("ALLOC SEND %p", skb); + + ASSERTCMP(skb->mark, ==, 0); + + _debug("HS: %u", call->conn->header_size); + skb_reserve(skb, call->conn->header_size); + skb->len += call->conn->header_size; + + sp = rxrpc_skb(skb); + sp->remain = chunk; + if (sp->remain > skb_tailroom(skb)) + sp->remain = skb_tailroom(skb); + + _net("skb: hr %d, tr %d, hl %d, rm %d", + skb_headroom(skb), + skb_tailroom(skb), + skb_headlen(skb), + sp->remain); + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + _debug("append"); + sp = rxrpc_skb(skb); + + /* append next segment of data to the current buffer */ + if (msg_data_left(msg) > 0) { + int copy = skb_tailroom(skb); + ASSERTCMP(copy, >, 0); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); + if (copy > sp->remain) + copy = sp->remain; + + _debug("add"); + ret = skb_add_data(skb, &msg->msg_iter, copy); + _debug("added"); + if (ret < 0) + goto efault; + sp->remain -= copy; + skb->mark += copy; + copied += copy; + } + + /* check for the far side aborting the call or a network error + * occurring */ + if (call->state > RXRPC_CALL_COMPLETE) + goto call_aborted; + + /* add the packet to the send queue if it's now full */ + if (sp->remain <= 0 || + (msg_data_left(msg) == 0 && !more)) { + struct rxrpc_connection *conn = call->conn; + uint32_t seq; + size_t pad; + + /* pad out if we're using security */ + if (conn->security_ix) { + pad = conn->security_size + skb->mark; + pad = conn->size_align - pad; + pad &= conn->size_align - 1; + _debug("pad %zu", pad); + if (pad) + memset(skb_put(skb, pad), 0, pad); + } + + seq = atomic_inc_return(&call->sequence); + + sp->hdr.epoch = conn->proto.epoch; + sp->hdr.cid = call->cid; + sp->hdr.callNumber = call->call_id; + sp->hdr.seq = seq; + sp->hdr.serial = atomic_inc_return(&conn->serial); + sp->hdr.type = RXRPC_PACKET_TYPE_DATA; + sp->hdr.userStatus = 0; + sp->hdr.securityIndex = conn->security_ix; + sp->hdr._rsvd = 0; + sp->hdr.serviceId = call->service_id; + + sp->hdr.flags = conn->out_clientflag; + if (msg_data_left(msg) == 0 && !more) + sp->hdr.flags |= RXRPC_LAST_PACKET; + else if (CIRC_SPACE(call->acks_head, + ACCESS_ONCE(call->acks_tail), + call->acks_winsz) > 1) + sp->hdr.flags |= RXRPC_MORE_PACKETS; + if (more && seq & 1) + sp->hdr.flags |= RXRPC_REQUEST_ACK; + + ret = conn->security->secure_packet( + call, skb, skb->mark, + skb->head + sizeof(struct rxrpc_wire_header)); + if (ret < 0) + goto out; + + rxrpc_insert_header(skb); + rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); + skb = NULL; + } + } while (msg_data_left(msg) > 0); + +success: + ret = copied; +out: + call->tx_pending = skb; + _leave(" = %d", ret); + return ret; + +call_aborted: + rxrpc_free_skb(skb); + if (call->state == RXRPC_CALL_NETWORK_ERROR) + ret = call->error_report < RXRPC_LOCAL_ERROR_OFFSET ? + call->error_report : + call->error_report - RXRPC_LOCAL_ERROR_OFFSET; + else + ret = -ECONNABORTED; + _leave(" = %d", ret); + return ret; + +maybe_error: + if (copied) + goto success; + goto out; + +efault: + ret = -EFAULT; + goto out; +} diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c new file mode 100644 index 000000000..8940674b5 --- /dev/null +++ b/net/rxrpc/peer_event.c @@ -0,0 +1,281 @@ +/* Peer event handling, typically ICMP messages. + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); + +/* + * Find the peer associated with an ICMP packet. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, + const struct sk_buff *skb) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct sockaddr_rxrpc srx; + + _enter(""); + + memset(&srx, 0, sizeof(srx)); + srx.transport_type = local->srx.transport_type; + srx.transport.family = local->srx.transport.family; + + /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice + * versa? + */ + switch (srx.transport.family) { + case AF_INET: + srx.transport.sin.sin_port = serr->port; + srx.transport_len = sizeof(struct sockaddr_in); + switch (serr->ee.ee_origin) { + case SO_EE_ORIGIN_ICMP: + _net("Rx ICMP"); + memcpy(&srx.transport.sin.sin_addr, + skb_network_header(skb) + serr->addr_offset, + sizeof(struct in_addr)); + break; + case SO_EE_ORIGIN_ICMP6: + _net("Rx ICMP6 on v4 sock"); + memcpy(&srx.transport.sin.sin_addr, + skb_network_header(skb) + serr->addr_offset + 12, + sizeof(struct in_addr)); + break; + default: + memcpy(&srx.transport.sin.sin_addr, &ip_hdr(skb)->saddr, + sizeof(struct in_addr)); + break; + } + break; + + default: + BUG(); + } + + return rxrpc_lookup_peer_rcu(local, &srx); +} + +/* + * Handle an MTU/fragmentation problem. + */ +static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) +{ + u32 mtu = serr->ee.ee_info; + + _net("Rx ICMP Fragmentation Needed (%d)", mtu); + + /* wind down the local interface MTU */ + if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) { + peer->if_mtu = mtu; + _net("I/F MTU %u", mtu); + } + + if (mtu == 0) { + /* they didn't give us a size, estimate one */ + mtu = peer->if_mtu; + if (mtu > 1500) { + mtu >>= 1; + if (mtu < 1500) + mtu = 1500; + } else { + mtu -= 100; + if (mtu < peer->hdrsize) + mtu = peer->hdrsize + 4; + } + } + + if (mtu < peer->mtu) { + spin_lock_bh(&peer->lock); + peer->mtu = mtu; + peer->maxdata = peer->mtu - peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", + peer->mtu, peer->maxdata); + } +} + +/* + * Handle an error received on the local endpoint. + */ +void rxrpc_error_report(struct sock *sk) +{ + struct sock_exterr_skb *serr; + struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_peer *peer; + struct sk_buff *skb; + + _enter("%p{%d}", sk, local->debug_id); + + skb = sock_dequeue_err_skb(sk); + if (!skb) { + _leave("UDP socket errqueue empty"); + return; + } + serr = SKB_EXT_ERR(skb); + if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { + _leave("UDP empty message"); + kfree_skb(skb); + return; + } + + rxrpc_new_skb(skb); + + rcu_read_lock(); + peer = rxrpc_lookup_peer_icmp_rcu(local, skb); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + rcu_read_unlock(); + rxrpc_free_skb(skb); + _leave(" [no peer]"); + return; + } + + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && + serr->ee.ee_type == ICMP_DEST_UNREACH && + serr->ee.ee_code == ICMP_FRAG_NEEDED)) { + rxrpc_adjust_mtu(peer, serr); + rcu_read_unlock(); + rxrpc_free_skb(skb); + rxrpc_put_peer(peer); + _leave(" [MTU update]"); + return; + } + + rxrpc_store_error(peer, serr); + rcu_read_unlock(); + rxrpc_free_skb(skb); + + /* The ref we obtained is passed off to the work item */ + rxrpc_queue_work(&peer->error_distributor); + _leave(""); +} + +/* + * Map an error report to error codes on the peer record. + */ +static void rxrpc_store_error(struct rxrpc_peer *peer, + struct sock_exterr_skb *serr) +{ + struct sock_extended_err *ee; + int err; + + _enter(""); + + ee = &serr->ee; + + _net("Rx Error o=%d t=%d c=%d e=%d", + ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno); + + err = ee->ee_errno; + + switch (ee->ee_origin) { + case SO_EE_ORIGIN_ICMP: + switch (ee->ee_type) { + case ICMP_DEST_UNREACH: + switch (ee->ee_code) { + case ICMP_NET_UNREACH: + _net("Rx Received ICMP Network Unreachable"); + break; + case ICMP_HOST_UNREACH: + _net("Rx Received ICMP Host Unreachable"); + break; + case ICMP_PORT_UNREACH: + _net("Rx Received ICMP Port Unreachable"); + break; + case ICMP_NET_UNKNOWN: + _net("Rx Received ICMP Unknown Network"); + break; + case ICMP_HOST_UNKNOWN: + _net("Rx Received ICMP Unknown Host"); + break; + default: + _net("Rx Received ICMP DestUnreach code=%u", + ee->ee_code); + break; + } + break; + + case ICMP_TIME_EXCEEDED: + _net("Rx Received ICMP TTL Exceeded"); + break; + + default: + _proto("Rx Received ICMP error { type=%u code=%u }", + ee->ee_type, ee->ee_code); + break; + } + break; + + case SO_EE_ORIGIN_NONE: + case SO_EE_ORIGIN_LOCAL: + _proto("Rx Received local error { error=%d }", err); + err += RXRPC_LOCAL_ERROR_OFFSET; + break; + + case SO_EE_ORIGIN_ICMP6: + default: + _proto("Rx Received error report { orig=%u }", ee->ee_origin); + break; + } + + peer->error_report = err; +} + +/* + * Distribute an error that occurred on a peer + */ +void rxrpc_peer_error_distributor(struct work_struct *work) +{ + struct rxrpc_peer *peer = + container_of(work, struct rxrpc_peer, error_distributor); + struct rxrpc_call *call; + int error_report; + + _enter(""); + + error_report = READ_ONCE(peer->error_report); + + _debug("ISSUE ERROR %d", error_report); + + spin_lock_bh(&peer->lock); + + while (!hlist_empty(&peer->error_targets)) { + call = hlist_entry(peer->error_targets.first, + struct rxrpc_call, error_link); + hlist_del_init(&call->error_link); + + write_lock(&call->state_lock); + if (call->state != RXRPC_CALL_COMPLETE && + call->state < RXRPC_CALL_NETWORK_ERROR) { + call->error_report = error_report; + call->state = RXRPC_CALL_NETWORK_ERROR; + set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + } + + spin_unlock_bh(&peer->lock); + + rxrpc_put_peer(peer); + _leave(""); +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c new file mode 100644 index 000000000..538e9831c --- /dev/null +++ b/net/rxrpc/peer_object.c @@ -0,0 +1,315 @@ +/* RxRPC remote transport endpoint record management + * + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static DEFINE_HASHTABLE(rxrpc_peer_hash, 10); +static DEFINE_SPINLOCK(rxrpc_peer_hash_lock); + +/* + * Hash a peer key. + */ +static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx) +{ + const u16 *p; + unsigned int i, size; + unsigned long hash_key; + + _enter(""); + + hash_key = (unsigned long)local / __alignof__(*local); + hash_key += srx->transport_type; + hash_key += srx->transport_len; + hash_key += srx->transport.family; + + switch (srx->transport.family) { + case AF_INET: + hash_key += (u16 __force)srx->transport.sin.sin_port; + size = sizeof(srx->transport.sin.sin_addr); + p = (u16 *)&srx->transport.sin.sin_addr; + break; + default: + WARN(1, "AF_RXRPC: Unsupported transport address family\n"); + return 0; + } + + /* Step through the peer address in 16-bit portions for speed */ + for (i = 0; i < size; i += sizeof(*p), p++) + hash_key += *p; + + _leave(" 0x%lx", hash_key); + return hash_key; +} + +/* + * Compare a peer to a key. Return -ve, 0 or +ve to indicate less than, same + * or greater than. + * + * Unfortunately, the primitives in linux/hashtable.h don't allow for sorted + * buckets and mid-bucket insertion, so we don't make full use of this + * information at this point. + */ +static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer, + struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx, + unsigned long hash_key) +{ + long diff; + + diff = ((peer->hash_key - hash_key) ?: + ((unsigned long)peer->local - (unsigned long)local) ?: + (peer->srx.transport_type - srx->transport_type) ?: + (peer->srx.transport_len - srx->transport_len) ?: + (peer->srx.transport.family - srx->transport.family)); + if (diff != 0) + return diff; + + switch (srx->transport.family) { + case AF_INET: + return ((u16 __force)peer->srx.transport.sin.sin_port - + (u16 __force)srx->transport.sin.sin_port) ?: + memcmp(&peer->srx.transport.sin.sin_addr, + &srx->transport.sin.sin_addr, + sizeof(struct in_addr)); + default: + BUG(); + } +} + +/* + * Look up a remote transport endpoint for the specified address using RCU. + */ +static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( + struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx, + unsigned long hash_key) +{ + struct rxrpc_peer *peer; + + hash_for_each_possible_rcu(rxrpc_peer_hash, peer, hash_link, hash_key) { + if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) { + if (atomic_read(&peer->usage) == 0) + return NULL; + return peer; + } + } + + return NULL; +} + +/* + * Look up a remote transport endpoint for the specified address using RCU. + */ +struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx) +{ + struct rxrpc_peer *peer; + unsigned long hash_key = rxrpc_peer_hash_key(local, srx); + + peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); + if (peer) { + switch (srx->transport.family) { + case AF_INET: + _net("PEER %d {%d,%u,%pI4+%hu}", + peer->debug_id, + peer->srx.transport_type, + peer->srx.transport.family, + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); + break; + } + + _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); + } + return peer; +} + +/* + * assess the MTU size for the network interface through which this peer is + * reached + */ +static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) +{ + struct rtable *rt; + struct flowi4 fl4; + + peer->if_mtu = 1500; + + rt = ip_route_output_ports(&init_net, &fl4, NULL, + peer->srx.transport.sin.sin_addr.s_addr, 0, + htons(7000), htons(7001), + IPPROTO_UDP, 0, 0); + if (IS_ERR(rt)) { + _leave(" [route err %ld]", PTR_ERR(rt)); + return; + } + + peer->if_mtu = dst_mtu(&rt->dst); + dst_release(&rt->dst); + + _leave(" [if_mtu %u]", peer->if_mtu); +} + +/* + * Allocate a peer. + */ +struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) +{ + struct rxrpc_peer *peer; + + _enter(""); + + peer = kzalloc(sizeof(struct rxrpc_peer), gfp); + if (peer) { + atomic_set(&peer->usage, 1); + peer->local = local; + INIT_HLIST_HEAD(&peer->error_targets); + INIT_WORK(&peer->error_distributor, + &rxrpc_peer_error_distributor); + peer->service_conns = RB_ROOT; + seqlock_init(&peer->service_conn_lock); + spin_lock_init(&peer->lock); + peer->debug_id = atomic_inc_return(&rxrpc_debug_id); + } + + _leave(" = %p", peer); + return peer; +} + +/* + * Set up a new peer. + */ +static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, + struct sockaddr_rxrpc *srx, + unsigned long hash_key, + gfp_t gfp) +{ + struct rxrpc_peer *peer; + + _enter(""); + + peer = rxrpc_alloc_peer(local, gfp); + if (peer) { + peer->hash_key = hash_key; + memcpy(&peer->srx, srx, sizeof(*srx)); + + rxrpc_assess_MTU_size(peer); + peer->mtu = peer->if_mtu; + + if (srx->transport.family == AF_INET) { + peer->hdrsize = sizeof(struct iphdr); + switch (srx->transport_type) { + case SOCK_DGRAM: + peer->hdrsize += sizeof(struct udphdr); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + + peer->hdrsize += sizeof(struct rxrpc_wire_header); + peer->maxdata = peer->mtu - peer->hdrsize; + } + + _leave(" = %p", peer); + return peer; +} + +/* + * obtain a remote transport endpoint for the specified address + */ +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, + struct sockaddr_rxrpc *srx, gfp_t gfp) +{ + struct rxrpc_peer *peer, *candidate; + unsigned long hash_key = rxrpc_peer_hash_key(local, srx); + + _enter("{%d,%d,%pI4+%hu}", + srx->transport_type, + srx->transport_len, + &srx->transport.sin.sin_addr, + ntohs(srx->transport.sin.sin_port)); + + /* search the peer list first */ + rcu_read_lock(); + peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + rcu_read_unlock(); + + if (!peer) { + /* The peer is not yet present in hash - create a candidate + * for a new record and then redo the search. + */ + candidate = rxrpc_create_peer(local, srx, hash_key, gfp); + if (!candidate) { + _leave(" = NULL [nomem]"); + return NULL; + } + + spin_lock(&rxrpc_peer_hash_lock); + + /* Need to check that we aren't racing with someone else */ + peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) + hash_add_rcu(rxrpc_peer_hash, + &candidate->hash_link, hash_key); + + spin_unlock(&rxrpc_peer_hash_lock); + + if (peer) + kfree(candidate); + else + peer = candidate; + } + + _net("PEER %d {%d,%pI4+%hu}", + peer->debug_id, + peer->srx.transport_type, + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); + + _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); + return peer; +} + +/* + * Discard a ref on a remote peer record. + */ +void __rxrpc_put_peer(struct rxrpc_peer *peer) +{ + ASSERT(hlist_empty(&peer->error_targets)); + + spin_lock(&rxrpc_peer_hash_lock); + hash_del_rcu(&peer->hash_link); + spin_unlock(&rxrpc_peer_hash_lock); + + kfree_rcu(peer, rcu); +} diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c new file mode 100644 index 000000000..ced5f0744 --- /dev/null +++ b/net/rxrpc/proc.c @@ -0,0 +1,192 @@ +/* /proc/net/ support for AF_RXRPC + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "ar-internal.h" + +static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { + [RXRPC_CONN_UNUSED] = "Unused ", + [RXRPC_CONN_CLIENT] = "Client ", + [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", + [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ", + [RXRPC_CONN_SERVICE] = "SvSecure", + [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", + [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CONN_NETWORK_ERROR] = "NetError", +}; + +/* + * generate a list of extant and dead calls in /proc/net/rxrpc_calls + */ +static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) +{ + read_lock(&rxrpc_call_lock); + return seq_list_start_head(&rxrpc_calls, *_pos); +} + +static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &rxrpc_calls, pos); +} + +static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&rxrpc_call_lock); +} + +static int rxrpc_call_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_connection *conn; + struct rxrpc_call *call; + char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + + if (v == &rxrpc_calls) { + seq_puts(seq, + "Proto Local Remote " + " SvID ConnID CallID End Use State Abort " + " UserID\n"); + return 0; + } + + call = list_entry(v, struct rxrpc_call, link); + + sprintf(lbuff, "%pI4:%u", + &call->local->srx.transport.sin.sin_addr, + ntohs(call->local->srx.transport.sin.sin_port)); + + conn = call->conn; + if (conn) + sprintf(rbuff, "%pI4:%u", + &conn->params.peer->srx.transport.sin.sin_addr, + ntohs(conn->params.peer->srx.transport.sin.sin_port)); + else + strcpy(rbuff, "no_connection"); + + seq_printf(seq, + "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + " %-8.8s %08x %lx\n", + lbuff, + rbuff, + call->service_id, + call->cid, + call->call_id, + call->in_clientflag ? "Svc" : "Clt", + atomic_read(&call->usage), + rxrpc_call_states[call->state], + call->remote_abort ?: call->local_abort, + call->user_call_ID); + + return 0; +} + +static const struct seq_operations rxrpc_call_seq_ops = { + .start = rxrpc_call_seq_start, + .next = rxrpc_call_seq_next, + .stop = rxrpc_call_seq_stop, + .show = rxrpc_call_seq_show, +}; + +static int rxrpc_call_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rxrpc_call_seq_ops); +} + +const struct file_operations rxrpc_call_seq_fops = { + .owner = THIS_MODULE, + .open = rxrpc_call_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * generate a list of extant virtual connections in /proc/net/rxrpc_conns + */ +static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) +{ + read_lock(&rxrpc_connection_lock); + return seq_list_start_head(&rxrpc_connections, *_pos); +} + +static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + return seq_list_next(v, &rxrpc_connections, pos); +} + +static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&rxrpc_connection_lock); +} + +static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_connection *conn; + char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + + if (v == &rxrpc_connections) { + seq_puts(seq, + "Proto Local Remote " + " SvID ConnID End Use State Key " + " Serial ISerial\n" + ); + return 0; + } + + conn = list_entry(v, struct rxrpc_connection, link); + + sprintf(lbuff, "%pI4:%u", + &conn->params.local->srx.transport.sin.sin_addr, + ntohs(conn->params.local->srx.transport.sin.sin_port)); + + sprintf(rbuff, "%pI4:%u", + &conn->params.peer->srx.transport.sin.sin_addr, + ntohs(conn->params.peer->srx.transport.sin.sin_port)); + + seq_printf(seq, + "UDP %-22.22s %-22.22s %4x %08x %s %3u" + " %s %08x %08x %08x\n", + lbuff, + rbuff, + conn->params.service_id, + conn->proto.cid, + rxrpc_conn_is_service(conn) ? "Svc" : "Clt", + atomic_read(&conn->usage), + rxrpc_conn_states[conn->state], + key_serial(conn->params.key), + atomic_read(&conn->serial), + atomic_read(&conn->hi_serial)); + + return 0; +} + +static const struct seq_operations rxrpc_connection_seq_ops = { + .start = rxrpc_connection_seq_start, + .next = rxrpc_connection_seq_next, + .stop = rxrpc_connection_seq_stop, + .show = rxrpc_connection_seq_show, +}; + + +static int rxrpc_connection_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rxrpc_connection_seq_ops); +} + +const struct file_operations rxrpc_connection_seq_fops = { + .owner = THIS_MODULE, + .open = rxrpc_connection_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c new file mode 100644 index 000000000..9ed66d533 --- /dev/null +++ b/net/rxrpc/recvmsg.c @@ -0,0 +1,417 @@ +/* RxRPC recvmsg() implementation + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * removal a call's user ID from the socket tree to make the user ID available + * again and so that it won't be seen again in association with that call + */ +void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) +{ + _debug("RELEASE CALL %d", call->debug_id); + + if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + write_lock_bh(&rx->call_lock); + rb_erase(&call->sock_node, &call->socket->calls); + clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + write_unlock_bh(&rx->call_lock); + } + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} + +/* + * receive a message from an RxRPC socket + * - we need to be careful about two or more threads calling recvmsg + * simultaneously + */ +int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_call *call = NULL, *continue_call = NULL; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + long timeo; + int copy, ret, ullen, offset, copied = 0; + u32 abort_code; + + DEFINE_WAIT(wait); + + _enter(",,,%zu,%d", len, flags); + + if (flags & (MSG_OOB | MSG_TRUNC)) + return -EOPNOTSUPP; + + ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long); + + timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); + msg->msg_flags |= MSG_MORE; + + lock_sock(&rx->sk); + + for (;;) { + /* return immediately if a client socket has no outstanding + * calls */ + if (RB_EMPTY_ROOT(&rx->calls)) { + if (copied) + goto out; + if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { + release_sock(&rx->sk); + if (continue_call) + rxrpc_put_call(continue_call); + return -ENODATA; + } + } + + /* get the next message on the Rx queue */ + skb = skb_peek(&rx->sk.sk_receive_queue); + if (!skb) { + /* nothing remains on the queue */ + if (copied && + (flags & MSG_PEEK || timeo == 0)) + goto out; + + /* wait for a message to turn up */ + release_sock(&rx->sk); + prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, + TASK_INTERRUPTIBLE); + ret = sock_error(&rx->sk); + if (ret) + goto wait_error; + + if (skb_queue_empty(&rx->sk.sk_receive_queue)) { + if (signal_pending(current)) + goto wait_interrupted; + timeo = schedule_timeout(timeo); + } + finish_wait(sk_sleep(&rx->sk), &wait); + lock_sock(&rx->sk); + continue; + } + + peek_next_packet: + sp = rxrpc_skb(skb); + call = sp->call; + ASSERT(call != NULL); + + _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); + + /* make sure we wait for the state to be updated in this call */ + spin_lock_bh(&call->lock); + spin_unlock_bh(&call->lock); + + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + _debug("packet from released call"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + continue; + } + + /* determine whether to continue last data receive */ + if (continue_call) { + _debug("maybe cont"); + if (call != continue_call || + skb->mark != RXRPC_SKB_MARK_DATA) { + release_sock(&rx->sk); + rxrpc_put_call(continue_call); + _leave(" = %d [noncont]", copied); + return copied; + } + } + + rxrpc_get_call(call); + + /* copy the peer address and timestamp */ + if (!continue_call) { + if (msg->msg_name) { + size_t len = + sizeof(call->conn->params.peer->srx); + memcpy(msg->msg_name, + &call->conn->params.peer->srx, len); + msg->msg_namelen = len; + } + sock_recv_timestamp(msg, &rx->sk, skb); + } + + /* receive the message */ + if (skb->mark != RXRPC_SKB_MARK_DATA) + goto receive_non_data_message; + + _debug("recvmsg DATA #%u { %d, %d }", + sp->hdr.seq, skb->len, sp->offset); + + if (!continue_call) { + /* only set the control data once per recvmsg() */ + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + ullen, &call->user_call_ID); + if (ret < 0) + goto copy_error; + ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + } + + ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); + ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); + call->rx_data_recv = sp->hdr.seq; + + ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); + + offset = sp->offset; + copy = skb->len - offset; + if (copy > len - copied) + copy = len - copied; + + ret = skb_copy_datagram_msg(skb, offset, msg, copy); + + if (ret < 0) + goto copy_error; + + /* handle piecemeal consumption of data packets */ + _debug("copied %d+%d", copy, copied); + + offset += copy; + copied += copy; + + if (!(flags & MSG_PEEK)) + sp->offset = offset; + + if (sp->offset < skb->len) { + _debug("buffer full"); + ASSERTCMP(copied, ==, len); + break; + } + + /* we transferred the whole data packet */ + if (!(flags & MSG_PEEK)) + rxrpc_kernel_data_consumed(call, skb); + + if (sp->hdr.flags & RXRPC_LAST_PACKET) { + _debug("last"); + if (rxrpc_conn_is_client(call->conn)) { + /* last byte of reply received */ + ret = copied; + goto terminal_message; + } + + /* last bit of request received */ + if (!(flags & MSG_PEEK)) { + _debug("eat packet"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != + skb) + BUG(); + rxrpc_free_skb(skb); + } + msg->msg_flags &= ~MSG_MORE; + break; + } + + /* move on to the next data message */ + _debug("next"); + if (!continue_call) + continue_call = sp->call; + else + rxrpc_put_call(call); + call = NULL; + + if (flags & MSG_PEEK) { + _debug("peek next"); + skb = skb->next; + if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) + break; + goto peek_next_packet; + } + + _debug("eat packet"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + } + + /* end of non-terminal data packet reception for the moment */ + _debug("end rcv data"); +out: + release_sock(&rx->sk); + if (call) + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d [data]", copied); + return copied; + + /* handle non-DATA messages such as aborts, incoming connections and + * final ACKs */ +receive_non_data_message: + _debug("non-data"); + + if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) { + _debug("RECV NEW CALL"); + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code); + if (ret < 0) + goto copy_error; + if (!(flags & MSG_PEEK)) { + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + } + goto out; + } + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + ullen, &call->user_call_ID); + if (ret < 0) + goto copy_error; + ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + + switch (skb->mark) { + case RXRPC_SKB_MARK_DATA: + BUG(); + case RXRPC_SKB_MARK_FINAL_ACK: + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code); + break; + case RXRPC_SKB_MARK_BUSY: + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); + break; + case RXRPC_SKB_MARK_REMOTE_ABORT: + abort_code = call->remote_abort; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); + break; + case RXRPC_SKB_MARK_LOCAL_ABORT: + abort_code = call->local_abort; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); + break; + case RXRPC_SKB_MARK_NET_ERROR: + _debug("RECV NET ERROR %d", sp->error); + abort_code = sp->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code); + break; + case RXRPC_SKB_MARK_LOCAL_ERROR: + _debug("RECV LOCAL ERROR %d", sp->error); + abort_code = sp->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, + &abort_code); + break; + default: + pr_err("Unknown packet mark %u\n", skb->mark); + BUG(); + break; + } + + if (ret < 0) + goto copy_error; + +terminal_message: + _debug("terminal"); + msg->msg_flags &= ~MSG_MORE; + msg->msg_flags |= MSG_EOR; + + if (!(flags & MSG_PEEK)) { + _net("free terminal skb %p", skb); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + rxrpc_remove_user_ID(rx, call); + } + + release_sock(&rx->sk); + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d", ret); + return ret; + +copy_error: + _debug("copy error"); + release_sock(&rx->sk); + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d", ret); + return ret; + +wait_interrupted: + ret = sock_intr_errno(timeo); +wait_error: + finish_wait(sk_sleep(&rx->sk), &wait); + if (continue_call) + rxrpc_put_call(continue_call); + if (copied) + copied = ret; + _leave(" = %d [waitfail %d]", copied, ret); + return copied; + +} + +/** + * rxrpc_kernel_is_data_last - Determine if data message is last one + * @skb: Message holding data + * + * Determine if data message is last one for the parent call. + */ +bool rxrpc_kernel_is_data_last(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA); + + return sp->hdr.flags & RXRPC_LAST_PACKET; +} + +EXPORT_SYMBOL(rxrpc_kernel_is_data_last); + +/** + * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message + * @skb: Message indicating an abort + * + * Get the abort code from an RxRPC abort message. + */ +u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + switch (skb->mark) { + case RXRPC_SKB_MARK_REMOTE_ABORT: + return sp->call->remote_abort; + case RXRPC_SKB_MARK_LOCAL_ABORT: + return sp->call->local_abort; + default: + BUG(); + } +} + +EXPORT_SYMBOL(rxrpc_kernel_get_abort_code); + +/** + * rxrpc_kernel_get_error - Get the error number from an RxRPC error message + * @skb: Message indicating an error + * + * Get the error number from an RxRPC error message. + */ +int rxrpc_kernel_get_error_number(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + return sp->error; +} + +EXPORT_SYMBOL(rxrpc_kernel_get_error_number); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index bab56ed64..63afa9e9c 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -56,9 +58,9 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) struct rxrpc_key_token *token; int ret; - _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); + _enter("{%d},{%x}", conn->debug_id, key_serial(conn->params.key)); - token = conn->key->payload.data[0]; + token = conn->params.key->payload.data[0]; conn->security_ix = token->security_index; ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); @@ -72,7 +74,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) sizeof(token->kad->session_key)) < 0) BUG(); - switch (conn->security_level) { + switch (conn->params.security_level) { case RXRPC_SECURITY_PLAIN: break; case RXRPC_SECURITY_AUTH: @@ -101,43 +103,43 @@ error: * prime the encryption state with the invariant parts of a connection's * description */ -static void rxkad_prime_packet_security(struct rxrpc_connection *conn) +static int rxkad_prime_packet_security(struct rxrpc_connection *conn) { struct rxrpc_key_token *token; SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); - struct scatterlist sg[2]; + struct scatterlist sg; struct rxrpc_crypt iv; - struct { - __be32 x[4]; - } tmpbuf __attribute__((aligned(16))); /* must all be in same page */ + __be32 *tmpbuf; + size_t tmpsize = 4 * sizeof(__be32); _enter(""); - if (!conn->key) - return; + if (!conn->params.key) + return 0; - token = conn->key->payload.data[0]; - memcpy(&iv, token->kad->session_key, sizeof(iv)); + tmpbuf = kmalloc(tmpsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; - tmpbuf.x[0] = htonl(conn->epoch); - tmpbuf.x[1] = htonl(conn->cid); - tmpbuf.x[2] = 0; - tmpbuf.x[3] = htonl(conn->security_ix); + token = conn->params.key->payload.data[0]; + memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); - sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + tmpbuf[0] = htonl(conn->proto.epoch); + tmpbuf[1] = htonl(conn->proto.cid); + tmpbuf[2] = 0; + tmpbuf[3] = htonl(conn->security_ix); + sg_init_one(&sg, tmpbuf, tmpsize); skcipher_request_set_tfm(req, conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); - + skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv)); - ASSERTCMP((u32 __force)conn->csum_iv.n[0], ==, (u32 __force)tmpbuf.x[2]); - - _leave(""); + memcpy(&conn->csum_iv, tmpbuf + 2, sizeof(conn->csum_iv)); + kfree(tmpbuf); + _leave(" = 0"); + return 0; } /* @@ -150,12 +152,9 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, { struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); + struct rxkad_level1_hdr hdr; struct rxrpc_crypt iv; - struct scatterlist sg[2]; - struct { - struct rxkad_level1_hdr hdr; - __be32 first; /* first four bytes of data and padding */ - } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + struct scatterlist sg; u16 check; sp = rxrpc_skb(skb); @@ -165,24 +164,19 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, check = sp->hdr.seq ^ sp->hdr.callNumber; data_size |= (u32)check << 16; - tmpbuf.hdr.data_size = htonl(data_size); - memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first)); + hdr.data_size = htonl(data_size); + memcpy(sechdr, &hdr, sizeof(hdr)); /* start the encryption afresh */ memset(&iv, 0, sizeof(iv)); - sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); - sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); - + sg_init_one(&sg, sechdr, 8); skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); - + skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - memcpy(sechdr, &tmpbuf, sizeof(tmpbuf)); - _leave(" = 0"); return 0; } @@ -196,8 +190,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, void *sechdr) { const struct rxrpc_key_token *token; - struct rxkad_level2_hdr rxkhdr - __attribute__((aligned(8))); /* must be all on one page */ + struct rxkad_level2_hdr rxkhdr; struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; @@ -216,18 +209,16 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr.data_size = htonl(data_size | (u32)check << 16); rxkhdr.checksum = 0; + memcpy(sechdr, &rxkhdr, sizeof(rxkhdr)); /* encrypt from the session key */ - token = call->conn->key->payload.data[0]; + token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); - sg_init_one(&sg[1], &rxkhdr, sizeof(rxkhdr)); - skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(rxkhdr), iv.x); - + skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x); crypto_skcipher_encrypt(req); /* we want to encrypt the skbuff in-place */ @@ -241,9 +232,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, sg_init_table(sg, nsg); skb_to_sgvec(skb, sg, 0, len); - skcipher_request_set_crypt(req, sg, sg, len, iv.x); - crypto_skcipher_encrypt(req); _leave(" = 0"); @@ -257,7 +246,7 @@ out: /* * checksum an RxRPC packet header */ -static int rxkad_secure_packet(const struct rxrpc_call *call, +static int rxkad_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, size_t data_size, void *sechdr) @@ -265,23 +254,20 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; - struct scatterlist sg[2]; - struct { - __be32 x[2]; - } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + struct scatterlist sg; u32 x, y; int ret; sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u},%zu,", - call->debug_id, key_serial(call->conn->key), sp->hdr.seq, - data_size); + call->debug_id, key_serial(call->conn->params.key), + sp->hdr.seq, data_size); if (!call->conn->cipher) return 0; - ret = key_validate(call->conn->key); + ret = key_validate(call->conn->params.key); if (ret < 0) return ret; @@ -291,26 +277,23 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, /* calculate the security checksum */ x = call->channel << (32 - RXRPC_CIDSHIFT); x |= sp->hdr.seq & 0x3fffffff; - tmpbuf.x[0] = htonl(sp->hdr.callNumber); - tmpbuf.x[1] = htonl(x); - - sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); - sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + call->crypto_buf[0] = htonl(sp->hdr.callNumber); + call->crypto_buf[1] = htonl(x); + sg_init_one(&sg, call->crypto_buf, 8); skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); - + skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - y = ntohl(tmpbuf.x[1]); + y = ntohl(call->crypto_buf[1]); y = (y >> 16) & 0xffff; if (y == 0) y = 1; /* zero checksums are not permitted */ sp->hdr.cksum = y; - switch (call->conn->security_level) { + switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: ret = 0; break; @@ -365,7 +348,6 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, 8, iv.x); - crypto_skcipher_decrypt(req); skcipher_request_zero(req); @@ -444,13 +426,12 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, skb_to_sgvec(skb, sg, 0, skb->len); /* decrypt from the session key */ - token = call->conn->key->payload.data[0]; + token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x); - crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (sg != _sg) @@ -496,17 +477,14 @@ nomem: /* * verify the security on a received packet */ -static int rxkad_verify_packet(const struct rxrpc_call *call, +static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, u32 *_abort_code) { SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_skb_priv *sp; struct rxrpc_crypt iv; - struct scatterlist sg[2]; - struct { - __be32 x[2]; - } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + struct scatterlist sg; u16 cksum; u32 x, y; int ret; @@ -514,7 +492,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u}", - call->debug_id, key_serial(call->conn->key), sp->hdr.seq); + call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq); if (!call->conn->cipher) return 0; @@ -531,20 +509,17 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, /* validate the security checksum */ x = call->channel << (32 - RXRPC_CIDSHIFT); x |= sp->hdr.seq & 0x3fffffff; - tmpbuf.x[0] = htonl(call->call_id); - tmpbuf.x[1] = htonl(x); - - sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); - sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + call->crypto_buf[0] = htonl(call->call_id); + call->crypto_buf[1] = htonl(x); + sg_init_one(&sg, call->crypto_buf, 8); skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); - + skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - y = ntohl(tmpbuf.x[1]); + y = ntohl(call->crypto_buf[1]); cksum = (y >> 16) & 0xffff; if (cksum == 0) cksum = 1; /* zero checksums are not permitted */ @@ -555,7 +530,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, return -EPROTO; } - switch (call->conn->security_level) { + switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: ret = 0; break; @@ -587,9 +562,9 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) u32 serial; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key)); - ret = key_validate(conn->key); + ret = key_validate(conn->params.key); if (ret < 0) return ret; @@ -600,14 +575,14 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) challenge.min_level = htonl(0); challenge.__padding = 0; - msg.msg_name = &conn->trans->peer->srx.transport.sin; - msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_name = &conn->params.peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->params.peer->srx.transport.sin); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; - whdr.epoch = htonl(conn->epoch); - whdr.cid = htonl(conn->cid); + whdr.epoch = htonl(conn->proto.epoch); + whdr.cid = htonl(conn->proto.cid); whdr.callNumber = 0; whdr.seq = 0; whdr.type = RXRPC_PACKET_TYPE_CHALLENGE; @@ -615,7 +590,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) whdr.userStatus = 0; whdr.securityIndex = conn->security_ix; whdr._rsvd = 0; - whdr.serviceId = htons(conn->service_id); + whdr.serviceId = htons(conn->params.service_id); iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); @@ -628,7 +603,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) whdr.serial = htonl(serial); _proto("Tx CHALLENGE %%%u", serial); - ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); if (ret < 0) { _debug("sendmsg failed: %d", ret); return -EAGAIN; @@ -655,8 +630,8 @@ static int rxkad_send_response(struct rxrpc_connection *conn, _enter(""); - msg.msg_name = &conn->trans->peer->srx.transport.sin; - msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_name = &conn->params.peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->params.peer->srx.transport.sin); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -682,7 +657,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn, whdr.serial = htonl(serial); _proto("Tx RESPONSE %%%u", serial); - ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len); + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len); if (ret < 0) { _debug("sendmsg failed: %d", ret); return -EAGAIN; @@ -707,29 +682,6 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response) response->encrypted.checksum = htonl(csum); } -/* - * load a scatterlist with a potentially split-page buffer - */ -static void rxkad_sg_set_buf2(struct scatterlist sg[2], - void *buf, size_t buflen) -{ - int nsg = 1; - - sg_init_table(sg, 2); - - sg_set_buf(&sg[0], buf, buflen); - if (sg[0].offset + buflen > PAGE_SIZE) { - /* the buffer was split over two pages */ - sg[0].length = PAGE_SIZE - sg[0].offset; - sg_set_buf(&sg[1], buf + sg[0].length, buflen - sg[0].length); - nsg++; - } - - sg_mark_end(&sg[nsg - 1]); - - ASSERTCMP(sg[0].length + sg[1].length, ==, buflen); -} - /* * encrypt the response packet */ @@ -739,17 +691,16 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn, { SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); struct rxrpc_crypt iv; - struct scatterlist sg[2]; + struct scatterlist sg[1]; /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); - rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted)); - + sg_init_table(sg, 1); + sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_tfm(req, conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_encrypt(req); skcipher_request_zero(req); } @@ -769,14 +720,14 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, u32 version, nonce, min_level, abort_code; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key)); - if (!conn->key) { + if (!conn->params.key) { _leave(" = -EPROTO [no key]"); return -EPROTO; } - ret = key_validate(conn->key); + ret = key_validate(conn->params.key); if (ret < 0) { *_abort_code = RXKADEXPIRED; return ret; @@ -799,31 +750,27 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, goto protocol_error; abort_code = RXKADLEVELFAIL; - if (conn->security_level < min_level) + if (conn->params.security_level < min_level) goto protocol_error; - token = conn->key->payload.data[0]; + token = conn->params.key->payload.data[0]; /* build the response packet */ memset(&resp, 0, sizeof(resp)); resp.version = htonl(RXKAD_VERSION); - resp.encrypted.epoch = htonl(conn->epoch); - resp.encrypted.cid = htonl(conn->cid); + resp.encrypted.epoch = htonl(conn->proto.epoch); + resp.encrypted.cid = htonl(conn->proto.cid); resp.encrypted.securityIndex = htonl(conn->security_ix); resp.encrypted.inc_nonce = htonl(nonce + 1); - resp.encrypted.level = htonl(conn->security_level); + resp.encrypted.level = htonl(conn->params.security_level); resp.kvno = htonl(token->kad->kvno); resp.ticket_len = htonl(token->kad->ticket_len); - resp.encrypted.call_id[0] = - htonl(conn->channels[0] ? conn->channels[0]->call_id : 0); - resp.encrypted.call_id[1] = - htonl(conn->channels[1] ? conn->channels[1]->call_id : 0); - resp.encrypted.call_id[2] = - htonl(conn->channels[2] ? conn->channels[2]->call_id : 0); - resp.encrypted.call_id[3] = - htonl(conn->channels[3] ? conn->channels[3]->call_id : 0); + resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter); + resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter); + resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter); + resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter); /* calculate the response checksum and then do the encryption */ rxkad_calc_response_checksum(&resp); @@ -885,10 +832,8 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, } sg_init_one(&sg[0], ticket, ticket_len); - skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, ticket_len, iv.x); - crypto_skcipher_decrypt(req); skcipher_request_free(req); @@ -999,7 +944,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, const struct rxrpc_crypt *session_key) { SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci); - struct scatterlist sg[2]; + struct scatterlist sg[1]; struct rxrpc_crypt iv; _enter(",,%08x%08x", @@ -1014,12 +959,11 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, memcpy(&iv, session_key, sizeof(iv)); - rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted)); - + sg_init_table(sg, 1); + sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_tfm(req, rxkad_ci); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_decrypt(req); skcipher_request_zero(req); @@ -1043,7 +987,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, void *ticket; u32 abort_code, version, kvno, ticket_len, level; __be32 csum; - int ret; + int ret, i; _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); @@ -1094,9 +1038,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, rxkad_decrypt_response(conn, &response, &session_key); abort_code = RXKADSEALEDINCON; - if (ntohl(response.encrypted.epoch) != conn->epoch) + if (ntohl(response.encrypted.epoch) != conn->proto.epoch) goto protocol_error_free; - if (ntohl(response.encrypted.cid) != conn->cid) + if (ntohl(response.encrypted.cid) != conn->proto.cid) goto protocol_error_free; if (ntohl(response.encrypted.securityIndex) != conn->security_ix) goto protocol_error_free; @@ -1106,11 +1050,26 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (response.encrypted.checksum != csum) goto protocol_error_free; - if (ntohl(response.encrypted.call_id[0]) > INT_MAX || - ntohl(response.encrypted.call_id[1]) > INT_MAX || - ntohl(response.encrypted.call_id[2]) > INT_MAX || - ntohl(response.encrypted.call_id[3]) > INT_MAX) - goto protocol_error_free; + spin_lock(&conn->channel_lock); + for (i = 0; i < RXRPC_MAXCALLS; i++) { + struct rxrpc_call *call; + u32 call_id = ntohl(response.encrypted.call_id[i]); + + if (call_id > INT_MAX) + goto protocol_error_unlock; + + if (call_id < conn->channels[i].call_counter) + goto protocol_error_unlock; + if (call_id > conn->channels[i].call_counter) { + call = rcu_dereference_protected( + conn->channels[i].call, + lockdep_is_held(&conn->channel_lock)); + if (call && call->state < RXRPC_CALL_COMPLETE) + goto protocol_error_unlock; + conn->channels[i].call_counter = call_id; + } + } + spin_unlock(&conn->channel_lock); abort_code = RXKADOUTOFSEQUENCE; if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1) @@ -1120,7 +1079,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, level = ntohl(response.encrypted.level); if (level > RXRPC_SECURITY_ENCRYPT) goto protocol_error_free; - conn->security_level = level; + conn->params.security_level = level; /* create a key to hold the security data and expiration time - after * this the connection security can be handled in exactly the same way @@ -1135,6 +1094,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _leave(" = 0"); return 0; +protocol_error_unlock: + spin_unlock(&conn->channel_lock); protocol_error_free: kfree(ticket); protocol_error: diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c new file mode 100644 index 000000000..814d285ff --- /dev/null +++ b/net/rxrpc/security.c @@ -0,0 +1,168 @@ +/* RxRPC security handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static LIST_HEAD(rxrpc_security_methods); +static DECLARE_RWSEM(rxrpc_security_sem); + +static const struct rxrpc_security *rxrpc_security_types[] = { + [RXRPC_SECURITY_NONE] = &rxrpc_no_security, +#ifdef CONFIG_RXKAD + [RXRPC_SECURITY_RXKAD] = &rxkad, +#endif +}; + +int __init rxrpc_init_security(void) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) { + if (rxrpc_security_types[i]) { + ret = rxrpc_security_types[i]->init(); + if (ret < 0) + goto failed; + } + } + + return 0; + +failed: + for (i--; i >= 0; i--) + if (rxrpc_security_types[i]) + rxrpc_security_types[i]->exit(); + return ret; +} + +void rxrpc_exit_security(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) + if (rxrpc_security_types[i]) + rxrpc_security_types[i]->exit(); +} + +/* + * look up an rxrpc security module + */ +static const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) +{ + if (security_index >= ARRAY_SIZE(rxrpc_security_types)) + return NULL; + return rxrpc_security_types[security_index]; +} + +/* + * initialise the security on a client connection + */ +int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) +{ + const struct rxrpc_security *sec; + struct rxrpc_key_token *token; + struct key *key = conn->params.key; + int ret; + + _enter("{%d},{%x}", conn->debug_id, key_serial(key)); + + if (!key) + return 0; + + ret = key_validate(key); + if (ret < 0) + return ret; + + token = key->payload.data[0]; + if (!token) + return -EKEYREJECTED; + + sec = rxrpc_security_lookup(token->security_index); + if (!sec) + return -EKEYREJECTED; + conn->security = sec; + + ret = conn->security->init_connection_security(conn); + if (ret < 0) { + conn->security = &rxrpc_no_security; + return ret; + } + + _leave(" = 0"); + return 0; +} + +/* + * initialise the security on a server connection + */ +int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) +{ + const struct rxrpc_security *sec; + struct rxrpc_local *local = conn->params.local; + struct rxrpc_sock *rx; + struct key *key; + key_ref_t kref; + char kdesc[5 + 1 + 3 + 1]; + + _enter(""); + + sprintf(kdesc, "%u:%u", conn->params.service_id, conn->security_ix); + + sec = rxrpc_security_lookup(conn->security_ix); + if (!sec) { + _leave(" = -ENOKEY [lookup]"); + return -ENOKEY; + } + + /* find the service */ + read_lock_bh(&local->services_lock); + list_for_each_entry(rx, &local->services, listen_link) { + if (rx->srx.srx_service == conn->params.service_id) + goto found_service; + } + + /* the service appears to have died */ + read_unlock_bh(&local->services_lock); + _leave(" = -ENOENT"); + return -ENOENT; + +found_service: + if (!rx->securities) { + read_unlock_bh(&local->services_lock); + _leave(" = -ENOKEY"); + return -ENOKEY; + } + + /* look through the service's keyring */ + kref = keyring_search(make_key_ref(rx->securities, 1UL), + &key_type_rxrpc_s, kdesc); + if (IS_ERR(kref)) { + read_unlock_bh(&local->services_lock); + _leave(" = %ld [search]", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + key = key_ref_to_ptr(kref); + read_unlock_bh(&local->services_lock); + + conn->server_key = key; + conn->security = sec; + + _leave(" = 0"); + return 0; +} diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c new file mode 100644 index 000000000..06c51d4b6 --- /dev/null +++ b/net/rxrpc/skbuff.c @@ -0,0 +1,165 @@ +/* ar-skbuff.c: socket buffer destruction handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * set up for the ACK at the end of the receive phase when we discard the final + * receive phase data packet + * - called with softirqs disabled + */ +static void rxrpc_request_final_ACK(struct rxrpc_call *call) +{ + /* the call may be aborted before we have a chance to ACK it */ + write_lock(&call->state_lock); + + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + call->state = RXRPC_CALL_CLIENT_FINAL_ACK; + _debug("request final ACK"); + + /* get an extra ref on the call for the final-ACK generator to + * release */ + rxrpc_get_call(call); + set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); + if (try_to_del_timer_sync(&call->ack_timer) >= 0) + rxrpc_queue_call(call); + break; + + case RXRPC_CALL_SERVER_RECV_REQUEST: + call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + default: + break; + } + + write_unlock(&call->state_lock); +} + +/* + * drop the bottom ACK off of the call ACK window and advance the window + */ +static void rxrpc_hard_ACK_data(struct rxrpc_call *call, + struct rxrpc_skb_priv *sp) +{ + int loop; + u32 seq; + + spin_lock_bh(&call->lock); + + _debug("hard ACK #%u", sp->hdr.seq); + + for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { + call->ackr_window[loop] >>= 1; + call->ackr_window[loop] |= + call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); + } + + seq = sp->hdr.seq; + ASSERTCMP(seq, ==, call->rx_data_eaten + 1); + call->rx_data_eaten = seq; + + if (call->ackr_win_top < UINT_MAX) + call->ackr_win_top++; + + ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, + call->rx_data_post, >=, call->rx_data_recv); + ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, + call->rx_data_recv, >=, call->rx_data_eaten); + + if (sp->hdr.flags & RXRPC_LAST_PACKET) { + rxrpc_request_final_ACK(call); + } else if (atomic_dec_and_test(&call->ackr_not_idle) && + test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { + /* We previously soft-ACK'd some received packets that have now + * been consumed, so send a hard-ACK if no more packets are + * immediately forthcoming to allow the transmitter to free up + * its Tx bufferage. + */ + _debug("send Rx idle ACK"); + __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial, + false); + } + + spin_unlock_bh(&call->lock); +} + +/** + * rxrpc_kernel_data_consumed - Record consumption of data message + * @call: The call to which the message pertains. + * @skb: Message holding data + * + * Record the consumption of a data message and generate an ACK if appropriate. + * The call state is shifted if this was the final packet. The caller must be + * in process context with no spinlocks held. + * + * TODO: Actually generate the ACK here rather than punting this to the + * workqueue. + */ +void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq); + + ASSERTCMP(sp->call, ==, call); + ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA); + + /* TODO: Fix the sequence number tracking */ + ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); + ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); + ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); + + call->rx_data_recv = sp->hdr.seq; + rxrpc_hard_ACK_data(call, sp); +} +EXPORT_SYMBOL(rxrpc_kernel_data_consumed); + +/* + * Destroy a packet that has an RxRPC control buffer + */ +void rxrpc_packet_destructor(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = sp->call; + + _enter("%p{%p}", skb, call); + + if (call) { + if (atomic_dec_return(&call->skb_count) < 0) + BUG(); + rxrpc_put_call(call); + sp->call = NULL; + } + + if (skb->sk) + sock_rfree(skb); + _leave(""); +} + +/** + * rxrpc_kernel_free_skb - Free an RxRPC socket buffer + * @skb: The socket buffer to be freed + * + * Let RxRPC free its own socket buffer, permitting it to maintain debug + * accounting. + */ +void rxrpc_kernel_free_skb(struct sk_buff *skb) +{ + rxrpc_free_skb(skb); +} +EXPORT_SYMBOL(rxrpc_kernel_free_skb); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index d20ed575a..03ad08774 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -18,6 +18,7 @@ static struct ctl_table_header *rxrpc_sysctl_reg_table; static const unsigned int zero = 0; static const unsigned int one = 1; static const unsigned int four = 4; +static const unsigned int thirtytwo = 32; static const unsigned int n_65535 = 65535; static const unsigned int n_max_acks = RXRPC_MAXACKS; @@ -89,16 +90,17 @@ static struct ctl_table rxrpc_sysctl_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, }, + + /* Non-time values */ { - .procname = "transport_expiry", - .data = &rxrpc_transport_expiry, + .procname = "max_backlog", + .data = &rxrpc_max_backlog, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)&four, + .extra2 = (void *)&thirtytwo, }, - - /* Non-time values */ { .procname = "rx_window_size", .data = &rxrpc_rx_window_size, diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c new file mode 100644 index 000000000..b88914d53 --- /dev/null +++ b/net/rxrpc/utils.c @@ -0,0 +1,46 @@ +/* Utility routines + * + * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include "ar-internal.h" + +/* + * Fill out a peer address from a socket buffer containing a packet. + */ +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb) +{ + memset(srx, 0, sizeof(*srx)); + + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.sin.sin_family = AF_INET; + srx->transport.sin.sin_port = udp_hdr(skb)->source; + srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + return 0; + + case ETH_P_IPV6: + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin6); + srx->transport.sin6.sin6_family = AF_INET6; + srx->transport.sin6.sin6_port = udp_hdr(skb)->source; + srx->transport.sin6.sin6_addr = ipv6_hdr(skb)->saddr; + return 0; + + default: + pr_warn_ratelimited("AF_RXRPC: Unknown eth protocol %u\n", + ntohs(skb->protocol)); + return -EAFNOSUPPORT; + } +} diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b148302bb..ccf931b3b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -494,6 +494,16 @@ config NET_CLS_FLOWER To compile this code as a module, choose M here: the module will be called cls_flower. +config NET_CLS_MATCHALL + tristate "Match-all classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + nothing. Every packet will match. + + To compile this code as a module, choose M here: the module will + be called cls_matchall. + config NET_EMATCH bool "Extended Matches" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 84bddb373..ae088a5a9 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o +obj-$(CONFIG_NET_CLS_MATCHALL) += cls_matchall.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index c7a0b0d48..d09d06875 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -29,45 +29,42 @@ static void free_tcf(struct rcu_head *head) { - struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu); + struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); kfree(p); } -static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) +static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p) { - struct tcf_common *p = a->priv; - spin_lock_bh(&hinfo->lock); - hlist_del(&p->tcfc_head); + hlist_del(&p->tcfa_head); spin_unlock_bh(&hinfo->lock); - gen_kill_estimator(&p->tcfc_bstats, - &p->tcfc_rate_est); + gen_kill_estimator(&p->tcfa_bstats, + &p->tcfa_rate_est); /* - * gen_estimator est_timer() might access p->tcfc_lock + * gen_estimator est_timer() might access p->tcfa_lock * or bstats, wait a RCU grace period before freeing p */ - call_rcu(&p->tcfc_rcu, free_tcf); + call_rcu(&p->tcfa_rcu, free_tcf); } -int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) +int __tcf_hash_release(struct tc_action *p, bool bind, bool strict) { - struct tcf_common *p = a->priv; int ret = 0; if (p) { if (bind) - p->tcfc_bindcnt--; - else if (strict && p->tcfc_bindcnt > 0) + p->tcfa_bindcnt--; + else if (strict && p->tcfa_bindcnt > 0) return -EPERM; - p->tcfc_refcnt--; - if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { - if (a->ops->cleanup) - a->ops->cleanup(a, bind); - tcf_hash_destroy(a->hinfo, a); + p->tcfa_refcnt--; + if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { + if (p->ops->cleanup) + p->ops->cleanup(p, bind); + tcf_hash_destroy(p->hinfo, p); ret = ACT_P_DELETED; } } @@ -77,10 +74,8 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) EXPORT_SYMBOL(__tcf_hash_release); static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, - struct netlink_callback *cb, struct tc_action *a) + struct netlink_callback *cb) { - struct hlist_head *head; - struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; @@ -89,19 +84,20 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, s_i = cb->args[0]; for (i = 0; i < (hinfo->hmask + 1); i++) { + struct hlist_head *head; + struct tc_action *p; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - hlist_for_each_entry_rcu(p, head, tcfc_head) { + hlist_for_each_entry_rcu(p, head, tcfa_head) { index++; if (index < s_i) continue; - a->priv = p; - a->order = n_i; - nest = nla_nest_start(skb, a->order); + nest = nla_nest_start(skb, n_i); if (nest == NULL) goto nla_put_failure; - err = tcf_action_dump_1(skb, a, 0, 0); + err = tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); @@ -125,27 +121,27 @@ nla_put_failure: } static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, - struct tc_action *a) + const struct tc_action_ops *ops) { - struct hlist_head *head; - struct hlist_node *n; - struct tcf_common *p; struct nlattr *nest; int i = 0, n_i = 0; int ret = -EINVAL; - nest = nla_nest_start(skb, a->order); + nest = nla_nest_start(skb, 0); if (nest == NULL) goto nla_put_failure; - if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; for (i = 0; i < (hinfo->hmask + 1); i++) { + struct hlist_head *head; + struct hlist_node *n; + struct tc_action *p; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - hlist_for_each_entry_safe(p, n, head, tcfc_head) { - a->priv = p; - ret = __tcf_hash_release(a, false, true); + hlist_for_each_entry_safe(p, n, head, tcfa_head) { + ret = __tcf_hash_release(p, false, true); if (ret == ACT_P_DELETED) { - module_put(a->ops->owner); + module_put(p->ops->owner); n_i++; } else if (ret < 0) goto nla_put_failure; @@ -163,16 +159,14 @@ nla_put_failure: int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tcf_hashinfo *hinfo = tn->hinfo; - a->hinfo = hinfo; - if (type == RTM_DELACTION) { - return tcf_del_walker(hinfo, skb, a); + return tcf_del_walker(hinfo, skb, ops); } else if (type == RTM_GETACTION) { - return tcf_dump_walker(hinfo, skb, cb, a); + return tcf_dump_walker(hinfo, skb, cb); } else { WARN(1, "tcf_generic_walker: unknown action %d\n", type); return -EINVAL; @@ -180,15 +174,15 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } EXPORT_SYMBOL(tcf_generic_walker); -static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +static struct tc_action *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) { - struct tcf_common *p = NULL; + struct tc_action *p = NULL; struct hlist_head *head; spin_lock_bh(&hinfo->lock); head = &hinfo->htab[tcf_hash(index, hinfo->hmask)]; - hlist_for_each_entry_rcu(p, head, tcfc_head) - if (p->tcfc_index == index) + hlist_for_each_entry_rcu(p, head, tcfa_head) + if (p->tcfa_index == index) break; spin_unlock_bh(&hinfo->lock); @@ -210,59 +204,58 @@ u32 tcf_hash_new_index(struct tc_action_net *tn) } EXPORT_SYMBOL(tcf_hash_new_index); -int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index) +int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { struct tcf_hashinfo *hinfo = tn->hinfo; - struct tcf_common *p = tcf_hash_lookup(index, hinfo); + struct tc_action *p = tcf_hash_lookup(index, hinfo); if (p) { - a->priv = p; - a->hinfo = hinfo; + *a = p; return 1; } return 0; } EXPORT_SYMBOL(tcf_hash_search); -int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, - int bind) +bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a, + int bind) { struct tcf_hashinfo *hinfo = tn->hinfo; - struct tcf_common *p = NULL; + struct tc_action *p = NULL; + if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { if (bind) - p->tcfc_bindcnt++; - p->tcfc_refcnt++; - a->priv = p; - a->hinfo = hinfo; - return 1; + p->tcfa_bindcnt++; + p->tcfa_refcnt++; + *a = p; + return true; } - return 0; + return false; } EXPORT_SYMBOL(tcf_hash_check); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) { - struct tcf_common *pc = a->priv; if (est) - gen_kill_estimator(&pc->tcfc_bstats, - &pc->tcfc_rate_est); - call_rcu(&pc->tcfc_rcu, free_tcf); + gen_kill_estimator(&a->tcfa_bstats, + &a->tcfa_rate_est); + call_rcu(&a->tcfa_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_cleanup); int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, - struct tc_action *a, int size, int bind, bool cpustats) + struct tc_action **a, const struct tc_action_ops *ops, + int bind, bool cpustats) { - struct tcf_common *p = kzalloc(size, GFP_KERNEL); + struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_hashinfo *hinfo = tn->hinfo; int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; - p->tcfc_refcnt = 1; + p->tcfa_refcnt = 1; if (bind) - p->tcfc_bindcnt = 1; + p->tcfa_bindcnt = 1; if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); @@ -278,35 +271,37 @@ err2: goto err1; } } - spin_lock_init(&p->tcfc_lock); - INIT_HLIST_NODE(&p->tcfc_head); - p->tcfc_index = index ? index : tcf_hash_new_index(tn); - p->tcfc_tm.install = jiffies; - p->tcfc_tm.lastuse = jiffies; + spin_lock_init(&p->tcfa_lock); + INIT_HLIST_NODE(&p->tcfa_head); + p->tcfa_index = index ? index : tcf_hash_new_index(tn); + p->tcfa_tm.install = jiffies; + p->tcfa_tm.lastuse = jiffies; + p->tcfa_tm.firstuse = 0; if (est) { - err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, - &p->tcfc_rate_est, - &p->tcfc_lock, est); + err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, + &p->tcfa_rate_est, + &p->tcfa_lock, NULL, est); if (err) { free_percpu(p->cpu_qstats); goto err2; } } - a->priv = (void *) p; - a->hinfo = hinfo; + p->hinfo = hinfo; + p->ops = ops; + INIT_LIST_HEAD(&p->list); + *a = p; return 0; } EXPORT_SYMBOL(tcf_hash_create); void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a) { - struct tcf_common *p = a->priv; struct tcf_hashinfo *hinfo = tn->hinfo; - unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); + unsigned int h = tcf_hash(a->tcfa_index, hinfo->hmask); spin_lock_bh(&hinfo->lock); - hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); + hlist_add_head(&a->tcfa_head, &hinfo->htab[h]); spin_unlock_bh(&hinfo->lock); } EXPORT_SYMBOL(tcf_hash_insert); @@ -314,21 +309,16 @@ EXPORT_SYMBOL(tcf_hash_insert); void tcf_hashinfo_destroy(const struct tc_action_ops *ops, struct tcf_hashinfo *hinfo) { - struct tc_action a = { - .ops = ops, - .hinfo = hinfo, - }; int i; for (i = 0; i < hinfo->hmask + 1; i++) { - struct tcf_common *p; + struct tc_action *p; struct hlist_node *n; - hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) { + hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfa_head) { int ret; - a.priv = p; - ret = __tcf_hash_release(&a, false, true); + ret = __tcf_hash_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) @@ -430,18 +420,19 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) return res; } -int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, - struct tcf_result *res) +int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, + int nr_actions, struct tcf_result *res) { - const struct tc_action *a; - int ret = -1; + int ret = -1, i; if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); ret = TC_ACT_OK; goto exec_done; } - list_for_each_entry(a, actions, list) { + for (i = 0; i < nr_actions; i++) { + const struct tc_action *a = actions[i]; + repeat: ret = a->ops->act(skb, a, res); if (ret == TC_ACT_REPEAT) @@ -465,8 +456,6 @@ int tcf_action_destroy(struct list_head *actions, int bind) module_put(a->ops->owner); else if (ret < 0) return ret; - list_del(&a->list); - kfree(a); } return ret; } @@ -503,8 +492,8 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_action_dump_1); -int -tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref) +int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, + int bind, int ref) { struct tc_action *a; int err = -EINVAL; @@ -580,20 +569,13 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, goto err_out; } - err = -ENOMEM; - a = kzalloc(sizeof(*a), GFP_KERNEL); - if (a == NULL) - goto err_mod; - - a->ops = a_o; - INIT_LIST_HEAD(&a->list); /* backward compatibility for policer */ if (name == NULL) - err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind); else - err = a_o->init(net, nla, est, a, ovr, bind); + err = a_o->init(net, nla, est, &a, ovr, bind); if (err < 0) - goto err_free; + goto err_mod; /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then @@ -604,8 +586,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, return a; -err_free: - kfree(a); err_mod: module_put(a_o->owner); err_out: @@ -641,12 +621,11 @@ err: return err; } -int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, +int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { int err = 0; struct gnet_dump d; - struct tcf_common *p = a->priv; if (p == NULL) goto errout; @@ -655,27 +634,27 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, * to add additional backward compatibility statistic TLVs. */ if (compat_mode) { - if (a->type == TCA_OLD_COMPAT) + if (p->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, TCA_STATS, TCA_XSTATS, - &p->tcfc_lock, &d, + &p->tcfa_lock, &d, TCA_PAD); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, - &p->tcfc_lock, &d, TCA_ACT_PAD); + &p->tcfa_lock, &d, TCA_ACT_PAD); if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 || - gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, - &p->tcfc_rate_est) < 0 || + if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfa_bstats, + &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, - &p->tcfc_qstats, - p->tcfc_qstats.qlen) < 0) + &p->tcfa_qstats, + p->tcfa_qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) @@ -687,9 +666,9 @@ errout: return -1; } -static int -tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq, - u16 flags, int event, int bind, int ref) +static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, + u32 portid, u32 seq, u16 flags, int event, int bind, + int ref) { struct tcamsg *t; struct nlmsghdr *nlh; @@ -730,7 +709,8 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, + 0, 0) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -738,24 +718,11 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, return rtnl_unicast(skb, net, portid); } -static struct tc_action *create_a(int i) -{ - struct tc_action *act; - - act = kzalloc(sizeof(*act), GFP_KERNEL); - if (act == NULL) { - pr_debug("create_a: failed to alloc!\n"); - return NULL; - } - act->order = i; - INIT_LIST_HEAD(&act->list); - return act; -} - static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid) { struct nlattr *tb[TCA_ACT_MAX + 1]; + const struct tc_action_ops *ops; struct tc_action *a; int index; int err; @@ -770,40 +737,23 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, goto err_out; index = nla_get_u32(tb[TCA_ACT_INDEX]); - err = -ENOMEM; - a = create_a(0); - if (a == NULL) - goto err_out; - err = -EINVAL; - a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); - if (a->ops == NULL) /* could happen in batch of actions */ - goto err_free; + ops = tc_lookup_action(tb[TCA_ACT_KIND]); + if (!ops) /* could happen in batch of actions */ + goto err_out; err = -ENOENT; - if (a->ops->lookup(net, a, index) == 0) + if (ops->lookup(net, &a, index) == 0) goto err_mod; - module_put(a->ops->owner); + module_put(ops->owner); return a; err_mod: - module_put(a->ops->owner); -err_free: - kfree(a); + module_put(ops->owner); err_out: return ERR_PTR(err); } -static void cleanup_a(struct list_head *actions) -{ - struct tc_action *a, *tmp; - - list_for_each_entry_safe(a, tmp, actions, list) { - list_del(&a->list); - kfree(a); - } -} - static int tca_action_flush(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid) { @@ -814,8 +764,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, struct netlink_callback dcb; struct nlattr *nest; struct nlattr *tb[TCA_ACT_MAX + 1]; + const struct tc_action_ops *ops; struct nlattr *kind; - struct tc_action a; int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); @@ -832,13 +782,12 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, err = -EINVAL; kind = tb[TCA_ACT_KIND]; - memset(&a, 0, sizeof(struct tc_action)); - INIT_LIST_HEAD(&a.list); - a.ops = tc_lookup_action(kind); - if (a.ops == NULL) /*some idjot trying to flush unknown action */ + ops = tc_lookup_action(kind); + if (!ops) /*some idjot trying to flush unknown action */ goto err_out; - nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); + nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, + sizeof(*t), 0); if (!nlh) goto out_module_put; t = nlmsg_data(nlh); @@ -850,7 +799,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (nest == NULL) goto out_module_put; - err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a); + err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); if (err < 0) goto out_module_put; if (err == 0) @@ -860,7 +809,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; - module_put(a.ops->owner); + module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) @@ -869,7 +818,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, return err; out_module_put: - module_put(a.ops->owner); + module_put(ops->owner); err_out: noflush_out: kfree_skb(skb); @@ -946,7 +895,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, return ret; } err: - cleanup_a(&actions); + tcf_action_destroy(&actions, 0); return ret; } @@ -983,15 +932,9 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); if (ret) - goto done; + return ret; - /* dump then free all the actions after update; inserted policy - * stays intact - */ - ret = tcf_add_notify(net, n, &actions, portid); - cleanup_a(&actions); -done: - return ret; + return tcf_add_notify(net, n, &actions, portid); } static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) @@ -1001,7 +944,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; - if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETACTION) && + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); @@ -1080,7 +1024,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; struct tc_action_ops *a_o; - struct tc_action a; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *kind = find_dump_kind(cb->nlh); @@ -1094,9 +1037,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (a_o == NULL) return 0; - memset(&a, 0, sizeof(struct tc_action)); - a.ops = a_o; - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) @@ -1110,7 +1050,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto out_module_put; - ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a); + ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o); if (ret < 0) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c7123e01c..bfa870731 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -34,11 +34,12 @@ struct tcf_bpf_cfg { }; static int bpf_net_id; +static struct tc_action_ops act_bpf_ops; static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { - struct tcf_bpf *prog = act->priv; + struct tcf_bpf *prog = to_bpf(act); struct bpf_prog *filter; int action, filter_res; bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; @@ -134,7 +135,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { unsigned char *tp = skb_tail_pointer(skb); - struct tcf_bpf *prog = act->priv; + struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, .refcnt = prog->tcf_refcnt - ref, @@ -154,10 +155,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, if (ret) goto nla_put_failure; - tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install); - tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); - tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); - + tcf_tm_dump(&tm, &prog->tcf_tm); if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm, TCA_ACT_BPF_PAD)) goto nla_put_failure; @@ -172,7 +170,8 @@ nla_put_failure: static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = { [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) }, [TCA_ACT_BPF_FD] = { .type = NLA_U32 }, - [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN }, + [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, + .len = ACT_BPF_NAME_LEN }, [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, @@ -225,15 +224,10 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); - fp = bpf_prog_get(bpf_fd); + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT); if (IS_ERR(fp)) return PTR_ERR(fp); - if (fp->type != BPF_PROG_TYPE_SCHED_ACT) { - bpf_prog_put(fp); - return -EINVAL; - } - if (tb[TCA_ACT_BPF_NAME]) { name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]), nla_len(tb[TCA_ACT_BPF_NAME]), @@ -277,7 +271,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, } static int tcf_bpf_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *act, + struct nlattr *est, struct tc_action **act, int replace, int bind) { struct tc_action_net *tn = net_generic(net, bpf_net_id); @@ -302,7 +296,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, act, bind)) { ret = tcf_hash_create(tn, parm->index, est, act, - sizeof(*prog), bind, true); + &act_bpf_ops, bind, true); if (ret < 0) return ret; @@ -312,7 +306,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (bind) return 0; - tcf_hash_release(act, bind); + tcf_hash_release(*act, bind); if (!replace) return -EEXIST; } @@ -332,7 +326,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (ret < 0) goto out; - prog = to_bpf(act); + prog = to_bpf(*act); ASSERT_RTNL(); if (res != ACT_P_CREATED) @@ -350,7 +344,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(prog->filter, cfg.filter); if (res == ACT_P_CREATED) { - tcf_hash_insert(tn, act); + tcf_hash_insert(tn, *act); } else { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); @@ -360,7 +354,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return res; out: if (res == ACT_P_CREATED) - tcf_hash_cleanup(act, est); + tcf_hash_cleanup(*act, est); return ret; } @@ -369,20 +363,20 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind) { struct tcf_bpf_cfg tmp; - tcf_bpf_prog_fill_cfg(act->priv, &tmp); + tcf_bpf_prog_fill_cfg(to_bpf(act), &tmp); tcf_bpf_cfg_cleanup(&tmp); } static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, bpf_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, bpf_net_id); @@ -399,6 +393,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .walk = tcf_bpf_walker, .lookup = tcf_bpf_search, + .size = sizeof(struct tcf_bpf), }; static __net_init int bpf_init_net(struct net *net) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2ba700c76..eae07a2e7 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -31,6 +31,7 @@ #define CONNMARK_TAB_MASK 3 static int connmark_net_id; +static struct tc_action_ops act_connmark_ops; static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -38,13 +39,13 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, const struct nf_conntrack_tuple_hash *thash; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; - struct tcf_connmark_info *ca = a->priv; + struct tcf_connmark_info *ca = to_connmark(a); struct nf_conntrack_zone zone; struct nf_conn *c; int proto; spin_lock(&ca->tcf_lock); - ca->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ca->tcf_tm); bstats_update(&ca->tcf_bstats, skb); if (skb->protocol == htons(ETH_P_IP)) { @@ -96,7 +97,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { }; static int tcf_connmark_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -116,22 +117,22 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*ci), bind, false); + &act_connmark_ops, bind, false); if (ret) return ret; - ci = to_connmark(a); + ci = to_connmark(*a); ci->tcf_action = parm->action; ci->net = net; ci->zone = parm->zone; - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); ret = ACT_P_CREATED; } else { - ci = to_connmark(a); + ci = to_connmark(*a); if (bind) return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; /* replacing action and zone */ @@ -146,7 +147,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_connmark_info *ci = a->priv; + struct tcf_connmark_info *ci = to_connmark(a); struct tc_connmark opt = { .index = ci->tcf_index, @@ -160,9 +161,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(ci->tcf_tm.expires); + tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; @@ -175,14 +174,14 @@ nla_put_failure: static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, connmark_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -198,6 +197,7 @@ static struct tc_action_ops act_connmark_ops = { .init = tcf_connmark_init, .walk = tcf_connmark_walker, .lookup = tcf_connmark_search, + .size = sizeof(struct tcf_connmark_info), }; static __net_init int connmark_init_net(struct net *net) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 28e934ed0..b5dbf633a 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -43,9 +43,10 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { }; static int csum_net_id; +static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, csum_net_id); @@ -67,26 +68,26 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*p), bind, false); + &act_csum_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - p = to_tcf_csum(a); + p = to_tcf_csum(*a); spin_lock_bh(&p->tcf_lock); p->tcf_action = parm->action; p->update_flags = parm->update_flags; spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -496,12 +497,12 @@ fail: static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_csum *p = a->priv; + struct tcf_csum *p = to_tcf_csum(a); int action; u32 update_flags; spin_lock(&p->tcf_lock); - p->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&p->tcf_tm); bstats_update(&p->tcf_bstats, skb); action = p->tcf_action; update_flags = p->update_flags; @@ -534,7 +535,7 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_csum *p = a->priv; + struct tcf_csum *p = to_tcf_csum(a); struct tc_csum opt = { .update_flags = p->update_flags, .index = p->tcf_index, @@ -546,9 +547,8 @@ static int tcf_csum_dump(struct sk_buff *skb, if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + + tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; @@ -561,14 +561,14 @@ nla_put_failure: static int tcf_csum_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, csum_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, csum_net_id); @@ -584,6 +584,7 @@ static struct tc_action_ops act_csum_ops = { .init = tcf_csum_init, .walk = tcf_csum_walker, .lookup = tcf_csum_search, + .size = sizeof(struct tcf_csum), }; static __net_init int csum_init_net(struct net *net) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index ec5cc8435..e24a4093d 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -26,6 +26,7 @@ #define GACT_TAB_MASK 15 static int gact_net_id; +static struct tc_action_ops act_gact_ops; #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) @@ -56,7 +57,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { }; static int tcf_gact_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, gact_net_id); @@ -93,19 +94,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*gact), bind, true); + &act_gact_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - gact = to_gact(a); + gact = to_gact(*a); ASSERT_RTNL(); gact->tcf_action = parm->action; @@ -121,14 +122,14 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_gact *gact = a->priv; + struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); #ifdef CONFIG_GACT_PROB @@ -151,7 +152,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, u64 lastuse) { - struct tcf_gact *gact = a->priv; + struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; @@ -162,10 +163,11 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, tm->lastuse = lastuse; } -static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_gact *gact = a->priv; + struct tcf_gact *gact = to_gact(a); struct tc_gact opt = { .index = gact->tcf_index, .refcnt = gact->tcf_refcnt - ref, @@ -188,9 +190,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int goto nla_put_failure; } #endif - t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); + tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; return skb->len; @@ -202,14 +202,14 @@ nla_put_failure: static int tcf_gact_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, gact_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, gact_net_id); @@ -226,6 +226,7 @@ static struct tc_action_ops act_gact_ops = { .init = tcf_gact_init, .walk = tcf_gact_walker, .lookup = tcf_gact_search, + .size = sizeof(struct tcf_gact), }; static __net_init int gact_init_net(struct net *net) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 5c4cdea21..4a60cd5e1 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -37,6 +37,7 @@ static int ife_net_id; static int max_metacnt = IFE_META_MAX + 1; +static struct tc_action_ops act_ife_ops; static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)}, @@ -52,7 +53,7 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) u32 *tlv = (u32 *)(skbdata); u16 totlen = nla_total_size(dlen); /*alignment + hdr */ char *dptr = (char *)tlv + NLA_HDRLEN; - u32 htlv = attrtype << 16 | dlen; + u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN); *tlv = htonl(htlv); memset(dptr, 0, totlen - NLA_HDRLEN); @@ -364,7 +365,7 @@ out_nlmsg_trim: /* under ife->tcf_lock */ static void _tcf_ife_cleanup(struct tc_action *a, int bind) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { @@ -382,7 +383,7 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) static void tcf_ife_cleanup(struct tc_action *a, int bind) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a, bind); @@ -417,7 +418,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, } static int tcf_ife_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, ife_net_id); @@ -428,7 +429,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, u16 ife_type = 0; u8 *daddr = NULL; u8 *saddr = NULL; - int ret = 0, exists = 0; + bool exists = false; + int ret = 0; int err; err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy); @@ -450,25 +452,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, **/ if (!tb[TCA_IFE_TYPE]) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); pr_info("You MUST pass etherype for encoding\n"); return -EINVAL; } } if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife), + ret = tcf_hash_create(tn, parm->index, est, a, &act_ife_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - ife = to_ife(a); + ife = to_ife(*a); ife->flags = parm->flags; if (parm->flags & IFE_ENCODE) { @@ -506,9 +508,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (err) { metadata_parse_err: if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(a, bind); + _tcf_ife_cleanup(*a, bind); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -528,7 +530,7 @@ metadata_parse_err: err = use_all_metadata(ife); if (err) { if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(a, bind); + _tcf_ife_cleanup(*a, bind); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -540,7 +542,7 @@ metadata_parse_err: spin_unlock_bh(&ife->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -549,7 +551,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); struct tc_ife opt = { .index = ife->tcf_index, .refcnt = ife->tcf_refcnt - ref, @@ -562,9 +564,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(ife->tcf_tm.expires); + tcf_tm_dump(&t, &ife->tcf_tm); if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; @@ -624,15 +624,15 @@ struct meta_tlvhdr { static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; - u16 ifehdrln = ifehdr->metalen; + int ifehdrln = (int)ifehdr->metalen; struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data); spin_lock(&ife->tcf_lock); bstats_update(&ife->tcf_bstats, skb); - ife->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ife->tcf_tm); spin_unlock(&ife->tcf_lock); ifehdrln = ntohs(ifehdrln); @@ -698,7 +698,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ethhdr *oethh; /* outer ether header */ struct ethhdr *iethh; /* inner eth header */ @@ -722,7 +722,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, spin_lock(&ife->tcf_lock); bstats_update(&ife->tcf_bstats, skb); - ife->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ /* abuse overlimits to count when we allow packet @@ -740,8 +740,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, return TC_ACT_SHOT; } - iethh = eth_hdr(skb); - err = skb_cow_head(skb, hdrm); if (unlikely(err)) { ife->tcf_qstats.drops++; @@ -752,6 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, if (!(at & AT_EGRESS)) skb_push(skb, skb->dev->hard_header_len); + iethh = (struct ethhdr *)skb->data; __skb_push(skb, hdrm); memcpy(skb->data, iethh, skb->mac_len); skb_reset_mac_header(skb); @@ -802,7 +801,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); if (ife->flags & IFE_ENCODE) return tcf_ife_encode(skb, a, res); @@ -813,7 +812,7 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, pr_info_ratelimited("unknown failure(policy neither de/encode\n"); spin_lock(&ife->tcf_lock); bstats_update(&ife->tcf_bstats, skb); - ife->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ife->tcf_tm); ife->tcf_qstats.drops++; spin_unlock(&ife->tcf_lock); @@ -822,14 +821,14 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, static int tcf_ife_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, ife_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ife_net_id); @@ -846,6 +845,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .walk = tcf_ife_walker, .lookup = tcf_ife_search, + .size = sizeof(struct tcf_ife_info), }; static __net_init int ife_init_net(struct net *net) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d4bd19ee5..378c1c976 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -31,10 +31,13 @@ #define IPT_TAB_MASK 15 static int ipt_net_id; +static struct tc_action_ops act_ipt_ops; static int xt_net_id; +static struct tc_action_ops act_xt_ops; -static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) +static int ipt_init_target(struct xt_entry_target *t, char *table, + unsigned int hook) { struct xt_tgchk_param par; struct xt_target *target; @@ -89,14 +92,15 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { }; static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, - int bind) + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int ovr, int bind) { struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; char *tname; - int ret = 0, err, exists = 0; + bool exists = false; + int ret = 0, err; u32 hook = 0; u32 index = 0; @@ -116,19 +120,19 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } if (!exists) { - ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind, + ret = tcf_hash_create(tn, index, est, a, ops, bind, false); if (ret) return ret; @@ -136,13 +140,11 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - ipt = to_ipt(a); - hook = nla_get_u32(tb[TCA_IPT_HOOK]); err = -ENOMEM; @@ -161,6 +163,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (err < 0) goto err3; + ipt = to_ipt(*a); + spin_lock_bh(&ipt->tcf_lock); if (ret != ACT_P_CREATED) { ipt_destroy_target(ipt->tcfi_t); @@ -172,7 +176,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; err3: @@ -181,33 +185,33 @@ err2: kfree(tname); err1: if (ret == ACT_P_CREATED) - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return err; } static int tcf_ipt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return __tcf_ipt_init(tn, nla, est, a, ovr, bind); + return __tcf_ipt_init(tn, nla, est, a, &act_ipt_ops, ovr, bind); } static int tcf_xt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return __tcf_ipt_init(tn, nla, est, a, ovr, bind); + return __tcf_ipt_init(tn, nla, est, a, &act_xt_ops, ovr, bind); } static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { int ret = 0, result = 0; - struct tcf_ipt *ipt = a->priv; + struct tcf_ipt *ipt = to_ipt(a); struct xt_action_param par; if (skb_unclone(skb, GFP_ATOMIC)) @@ -215,7 +219,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, spin_lock(&ipt->tcf_lock); - ipt->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ipt->tcf_tm); bstats_update(&ipt->tcf_bstats, skb); /* yes, we have to worry about both in and out dev @@ -245,7 +249,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, default: net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", ret); - result = TC_POLICE_OK; + result = TC_ACT_OK; break; } spin_unlock(&ipt->tcf_lock); @@ -253,10 +257,11 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, } -static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ipt *ipt = a->priv; + struct tcf_ipt *ipt = to_ipt(a); struct xt_entry_target *t; struct tcf_t tm; struct tc_cnt c; @@ -280,11 +285,11 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) goto nla_put_failure; - tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); - tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); - tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); + + tcf_tm_dump(&tm, &ipt->tcf_tm); if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; + kfree(t); return skb->len; @@ -296,14 +301,14 @@ nla_put_failure: static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ipt_net_id); @@ -320,6 +325,7 @@ static struct tc_action_ops act_ipt_ops = { .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, + .size = sizeof(struct tcf_ipt), }; static __net_init int ipt_init_net(struct net *net) @@ -345,14 +351,14 @@ static struct pernet_operations ipt_net_ops = { static int tcf_xt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, xt_net_id); @@ -369,6 +375,7 @@ static struct tc_action_ops act_xt_ops = { .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, + .size = sizeof(struct tcf_ipt), }; static __net_init int xt_init_net(struct net *net) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1f5bd6ccb..6038c85d9 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -52,9 +52,10 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { }; static int mirred_net_id; +static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, mirred_net_id); @@ -62,7 +63,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tc_mirred *parm; struct tcf_mirred *m; struct net_device *dev; - int ret, ok_push = 0, exists = 0; + int ret, ok_push = 0; + bool exists = false; if (nla == NULL) return -EINVAL; @@ -83,14 +85,14 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, break; default: if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } if (parm->ifindex) { dev = __dev_get_by_index(net, parm->ifindex); if (dev == NULL) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -ENODEV; } switch (dev->type) { @@ -114,16 +116,16 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (dev == NULL) return -EINVAL; ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*m), bind, true); + &act_mirred_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - m = to_mirred(a); + m = to_mirred(*a); ASSERT_RTNL(); m->tcf_action = parm->action; @@ -141,7 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock_bh(&mirred_list_lock); - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); } return ret; @@ -150,14 +152,13 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_mirred *m = a->priv; + struct tcf_mirred *m = to_mirred(a); struct net_device *dev; struct sk_buff *skb2; int retval, err; u32 at; tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); rcu_read_lock(); @@ -206,7 +207,7 @@ out: static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_mirred *m = a->priv; + struct tcf_mirred *m = to_mirred(a); struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, @@ -219,9 +220,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(m->tcf_tm.expires); + + tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; return skb->len; @@ -233,14 +233,14 @@ nla_put_failure: static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, mirred_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, mirred_net_id); @@ -285,6 +285,7 @@ static struct tc_action_ops act_mirred_ops = { .init = tcf_mirred_init, .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, + .size = sizeof(struct tcf_mirred), }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index c0a879f94..8e8b0cc30 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -32,13 +32,14 @@ #define NAT_TAB_MASK 15 static int nat_net_id; +static struct tc_action_ops act_nat_ops; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) + struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -59,18 +60,18 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*p), bind, false); + &act_nat_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind) return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - p = to_tcf_nat(a); + p = to_tcf_nat(*a); spin_lock_bh(&p->tcf_lock); p->old_addr = parm->old_addr; @@ -82,7 +83,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -90,7 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_nat *p = a->priv; + struct tcf_nat *p = to_tcf_nat(a); struct iphdr *iph; __be32 old_addr; __be32 new_addr; @@ -103,7 +104,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, spin_lock(&p->tcf_lock); - p->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&p->tcf_tm); old_addr = p->old_addr; new_addr = p->new_addr; mask = p->mask; @@ -248,7 +249,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_nat *p = a->priv; + struct tcf_nat *p = to_tcf_nat(a); struct tc_nat opt = { .old_addr = p->old_addr, .new_addr = p->new_addr, @@ -264,9 +265,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + + tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; @@ -279,14 +279,14 @@ nla_put_failure: static int tcf_nat_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, nat_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, nat_net_id); @@ -302,6 +302,7 @@ static struct tc_action_ops act_nat_ops = { .init = tcf_nat_init, .walk = tcf_nat_walker, .lookup = tcf_nat_search, + .size = sizeof(struct tcf_nat), }; static __net_init int nat_init_net(struct net *net) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index c6e18f230..b54d56d49 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -26,13 +26,14 @@ #define PEDIT_TAB_MASK 15 static int pedit_net_id; +static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, }; static int tcf_pedit_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, pedit_net_id); @@ -61,23 +62,23 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (!parm->nkeys) return -EINVAL; ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*p), bind, false); + &act_pedit_ops, bind, false); if (ret) return ret; - p = to_pedit(a); + p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return -ENOMEM; } ret = ACT_P_CREATED; } else { if (bind) return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; - p = to_pedit(a); + p = to_pedit(*a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) @@ -96,13 +97,13 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, memcpy(p->tcfp_keys, parm->keys, ksize); spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } static void tcf_pedit_cleanup(struct tc_action *a, int bind) { - struct tcf_pedit *p = a->priv; + struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; kfree(keys); } @@ -110,7 +111,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind) static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_pedit *p = a->priv; + struct tcf_pedit *p = to_pedit(a); int i; unsigned int off; @@ -121,7 +122,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, spin_lock(&p->tcf_lock); - p->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&p->tcf_tm); if (p->tcfp_nkeys > 0) { struct tc_pedit_key *tkey = p->tcfp_keys; @@ -177,7 +178,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_pedit *p = a->priv; + struct tcf_pedit *p = to_pedit(a); struct tc_pedit *opt; struct tcf_t t; int s; @@ -200,11 +201,11 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + + tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; + kfree(opt); return skb->len; @@ -216,14 +217,14 @@ nla_put_failure: static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, pedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, pedit_net_id); @@ -240,6 +241,7 @@ static struct tc_action_ops act_pedit_ops = { .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, + .size = sizeof(struct tcf_pedit), }; static __net_init int pedit_init_net(struct net *net) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index c55778976..8a3be1d99 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -23,7 +23,7 @@ #include struct tcf_police { - struct tcf_common common; + struct tc_action common; int tcfp_result; u32 tcfp_ewma_rate; s64 tcfp_burst; @@ -37,8 +37,8 @@ struct tcf_police { struct psched_ratecfg peak; bool peak_present; }; -#define to_police(pc) \ - container_of(pc->priv, struct tcf_police, common) + +#define to_police(pc) ((struct tcf_police *)pc) #define POL_TAB_MASK 15 @@ -56,56 +56,15 @@ struct tc_police_compat { /* Each policer is serialized by its individual spinlock */ static int police_net_id; +static struct tc_action_ops act_police_ops; static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, police_net_id); - struct tcf_hashinfo *hinfo = tn->hinfo; - struct hlist_head *head; - struct tcf_common *p; - int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; - struct nlattr *nest; - - spin_lock_bh(&hinfo->lock); - - s_i = cb->args[0]; - - for (i = 0; i < (POL_TAB_MASK + 1); i++) { - head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; - - hlist_for_each_entry_rcu(p, head, tcfc_head) { - index++; - if (index < s_i) - continue; - a->priv = p; - a->order = index; - nest = nla_nest_start(skb, a->order); - if (nest == NULL) - goto nla_put_failure; - if (type == RTM_DELACTION) - err = tcf_action_dump_1(skb, a, 0, 1); - else - err = tcf_action_dump_1(skb, a, 0, 0); - if (err < 0) { - index--; - nla_nest_cancel(skb, nest); - goto done; - } - nla_nest_end(skb, nest); - n_i++; - } - } -done: - spin_unlock_bh(&hinfo->lock); - if (n_i) - cb->args[0] += n_i; - return n_i; -nla_put_failure: - nla_nest_cancel(skb, nest); - goto done; + return tcf_generic_walker(tn, skb, cb, type, ops); } static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { @@ -115,9 +74,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, }; -static int tcf_act_police_locate(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, - int ovr, int bind) +static int tcf_act_police_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind) { int ret = 0, err; struct nlattr *tb[TCA_POLICE_MAX + 1]; @@ -125,6 +84,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; struct tc_action_net *tn = net_generic(net, police_net_id); + bool exists = false; int size; if (nla == NULL) @@ -139,31 +99,25 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, size = nla_len(tb[TCA_POLICE_TBF]); if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) return -EINVAL; + parm = nla_data(tb[TCA_POLICE_TBF]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; - if (parm->index) { - if (tcf_hash_search(tn, a, parm->index)) { - police = to_police(a); - if (bind) { - police->tcf_bindcnt += 1; - police->tcf_refcnt += 1; - return 0; - } - if (ovr) - goto override; - /* not replacing */ - return -EEXIST; - } - } else { + if (!exists) { ret = tcf_hash_create(tn, parm->index, NULL, a, - sizeof(*police), bind, false); + &act_police_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; } - police = to_police(a); -override: + police = to_police(*a); if (parm->rate.rate) { err = -ENOMEM; R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]); @@ -182,7 +136,8 @@ override: if (est) { err = gen_replace_estimator(&police->tcf_bstats, NULL, &police->tcf_rate_est, - &police->tcf_lock, est); + &police->tcf_lock, + NULL, est); if (err) goto failure_unlock; } else if (tb[TCA_POLICE_AVRATE] && @@ -234,7 +189,7 @@ override: return ret; police->tcfp_t_c = ktime_get_ns(); - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; @@ -244,14 +199,14 @@ failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return err; } static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_police *police = a->priv; + struct tcf_police *police = to_police(a); s64 now; s64 toks; s64 ptoks = 0; @@ -310,7 +265,7 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_police *police = a->priv; + struct tcf_police *police = to_police(a); struct tc_police opt = { .index = police->tcf_index, .action = police->tcf_action, @@ -336,6 +291,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(police->tcf_tm.expires); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; @@ -347,7 +303,7 @@ nla_put_failure: return -1; } -static int tcf_police_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_police_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, police_net_id); @@ -364,9 +320,10 @@ static struct tc_action_ops act_police_ops = { .owner = THIS_MODULE, .act = tcf_act_police, .dump = tcf_act_police_dump, - .init = tcf_act_police_locate, + .init = tcf_act_police_init, .walk = tcf_act_police_walker, .lookup = tcf_police_search, + .size = sizeof(struct tcf_police), }; static __net_init int police_init_net(struct net *net) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index e42f8daca..289af6f9b 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -27,15 +27,16 @@ #define SIMP_TAB_MASK 7 static int simp_net_id; +static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_defact *d = a->priv; + struct tcf_defact *d = to_defact(a); spin_lock(&d->tcf_lock); - d->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&d->tcf_tm); bstats_update(&d->tcf_bstats, skb); /* print policy string followed by _ then packet count @@ -79,15 +80,16 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { }; static int tcf_simp_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; + bool exists = false; + int ret = 0, err; char *defdata; - int ret = 0, err, exists = 0; if (nla == NULL) return -EINVAL; @@ -99,7 +101,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; - parm = nla_data(tb[TCA_DEF_PARMS]); exists = tcf_hash_check(tn, parm->index, a, bind); if (exists && bind) @@ -107,7 +108,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (tb[TCA_DEF_DATA] == NULL) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } @@ -115,22 +116,22 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*d), bind, false); + &act_simp_ops, bind, false); if (ret) return ret; - d = to_defact(a); + d = to_defact(*a); ret = alloc_defdata(d, defdata); if (ret < 0) { - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return ret; } d->tcf_action = parm->action; ret = ACT_P_CREATED; } else { - d = to_defact(a); + d = to_defact(*a); - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; @@ -138,7 +139,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -146,7 +147,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_defact *d = a->priv; + struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, .refcnt = d->tcf_refcnt - ref, @@ -158,9 +159,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + + tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; return skb->len; @@ -172,14 +172,14 @@ nla_put_failure: static int tcf_simp_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, simp_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, simp_net_id); @@ -196,6 +196,7 @@ static struct tc_action_ops act_simp_ops = { .init = tcf_simp_init, .walk = tcf_simp_walker, .lookup = tcf_simp_search, + .size = sizeof(struct tcf_defact), }; static __net_init int simp_init_net(struct net *net) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index e92880296..a133dcb82 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -30,14 +30,15 @@ #define SKBEDIT_TAB_MASK 15 static int skbedit_net_id; +static struct tc_action_ops act_skbedit_ops; static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_skbedit *d = a->priv; + struct tcf_skbedit *d = to_skbedit(a); spin_lock(&d->tcf_lock); - d->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&d->tcf_tm); bstats_update(&d->tcf_bstats, skb); if (d->flags & SKBEDIT_F_PRIORITY) @@ -47,6 +48,8 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, skb_set_queue_mapping(skb, d->queue_mapping); if (d->flags & SKBEDIT_F_MARK) skb->mark = d->mark; + if (d->flags & SKBEDIT_F_PTYPE) + skb->pkt_type = d->ptype; spin_unlock(&d->tcf_lock); return d->tcf_action; @@ -57,10 +60,11 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -68,8 +72,9 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tc_skbedit *parm; struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL; - u16 *queue_mapping = NULL; - int ret = 0, err, exists = 0; + u16 *queue_mapping = NULL, *ptype = NULL; + bool exists = false; + int ret = 0, err; if (nla == NULL) return -EINVAL; @@ -91,6 +96,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); } + if (tb[TCA_SKBEDIT_PTYPE] != NULL) { + ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]); + if (!skb_pkt_type_ok(*ptype)) + return -EINVAL; + flags |= SKBEDIT_F_PTYPE; + } + if (tb[TCA_SKBEDIT_MARK] != NULL) { flags |= SKBEDIT_F_MARK; mark = nla_data(tb[TCA_SKBEDIT_MARK]); @@ -103,21 +115,21 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, return 0; if (!flags) { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*d), bind, false); + &act_skbedit_ops, bind, false); if (ret) return ret; - d = to_skbedit(a); + d = to_skbedit(*a); ret = ACT_P_CREATED; } else { - d = to_skbedit(a); - tcf_hash_release(a, bind); + d = to_skbedit(*a); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } @@ -131,13 +143,15 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, d->queue_mapping = *queue_mapping; if (flags & SKBEDIT_F_MARK) d->mark = *mark; + if (flags & SKBEDIT_F_PTYPE) + d->ptype = *ptype; d->tcf_action = parm->action; spin_unlock_bh(&d->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -145,7 +159,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbedit *d = a->priv; + struct tcf_skbedit *d = to_skbedit(a); struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = d->tcf_refcnt - ref, @@ -157,20 +171,19 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if ((d->flags & SKBEDIT_F_PRIORITY) && - nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), - &d->priority)) + nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority)) goto nla_put_failure; if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && - nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING, - sizeof(d->queue_mapping), &d->queue_mapping)) + nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping)) goto nla_put_failure; if ((d->flags & SKBEDIT_F_MARK) && - nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), - &d->mark)) + nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + if ((d->flags & SKBEDIT_F_PTYPE) && + nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) + goto nla_put_failure; + + tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; return skb->len; @@ -182,14 +195,14 @@ nla_put_failure: static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -205,6 +218,7 @@ static struct tc_action_ops act_skbedit_ops = { .init = tcf_skbedit_init, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, + .size = sizeof(struct tcf_skbedit), }; static __net_init int skbedit_init_net(struct net *net) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ac4adc812..691409de3 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -22,16 +22,17 @@ #define VLAN_TAB_MASK 15 static int vlan_net_id; +static struct tc_action_ops act_vlan_ops; static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_vlan *v = a->priv; + struct tcf_vlan *v = to_vlan(a); int action; int err; spin_lock(&v->tcf_lock); - v->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&v->tcf_tm); bstats_update(&v->tcf_bstats, skb); action = v->tcf_action; @@ -67,7 +68,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, vlan_net_id); @@ -77,8 +78,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, int action; __be16 push_vid = 0; __be16 push_proto = 0; - int ret = 0, exists = 0; - int err; + bool exists = false; + int ret = 0, err; if (!nla) return -EINVAL; @@ -100,13 +101,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case TCA_VLAN_ACT_PUSH: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -ERANGE; } @@ -125,25 +126,25 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, break; default: if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } action = parm->v_action; if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*v), bind, false); + &act_vlan_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - v = to_vlan(a); + v = to_vlan(*a); spin_lock_bh(&v->tcf_lock); @@ -156,7 +157,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&v->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -164,7 +165,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_vlan *v = a->priv; + struct tcf_vlan *v = to_vlan(a); struct tc_vlan opt = { .index = v->tcf_index, .refcnt = v->tcf_refcnt - ref, @@ -179,12 +180,11 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (v->tcfv_action == TCA_VLAN_ACT_PUSH && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || - nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto))) + nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, + v->tcfv_push_proto))) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(v->tcf_tm.expires); + tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; return skb->len; @@ -196,14 +196,14 @@ nla_put_failure: static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, vlan_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, vlan_net_id); @@ -219,6 +219,7 @@ static struct tc_action_ops act_vlan_ops = { .init = tcf_vlan_init, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, + .size = sizeof(struct tcf_vlan), }; static __net_init int vlan_init_net(struct net *net) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a75864d93..a7c564537 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -103,6 +103,17 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, unsigned long fh, int event); +static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, + struct tcf_proto __rcu **chain, int event) +{ + struct tcf_proto __rcu **it_chain; + struct tcf_proto *tp; + + for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL; + it_chain = &tp->next) + tfilter_notify(net, oskb, n, tp, 0, event); +} /* Select new prio value from the range, managed by kernel. */ @@ -156,11 +167,23 @@ replay: cl = 0; if (prio == 0) { - /* If no priority is given, user wants we allocated it. */ - if (n->nlmsg_type != RTM_NEWTFILTER || - !(n->nlmsg_flags & NLM_F_CREATE)) + switch (n->nlmsg_type) { + case RTM_DELTFILTER: + if (protocol || t->tcm_handle || tca[TCA_KIND]) + return -ENOENT; + break; + case RTM_NEWTFILTER: + /* If no priority is provided by the user, + * we allocate one. + */ + if (n->nlmsg_flags & NLM_F_CREATE) { + prio = TC_H_MAKE(0x80000000U, 0U); + break; + } + /* fall-through */ + default: return -ENOENT; - prio = TC_H_MAKE(0x80000000U, 0U); + } } /* Find head of filter chain. */ @@ -200,6 +223,12 @@ replay: err = -EINVAL; if (chain == NULL) goto errout; + if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { + tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); + tcf_destroy_chain(chain); + err = 0; + goto errout; + } /* Check the chain for existence of proto-tcf with this priority */ for (back = chain; @@ -351,8 +380,9 @@ errout: return err; } -static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, - unsigned long fh, u32 portid, u32 seq, u16 flags, int event) +static int tcf_fill_node(struct net *net, struct sk_buff *skb, + struct tcf_proto *tp, unsigned long fh, u32 portid, + u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -474,9 +504,11 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) TC_H_MIN(tcm->tcm_info) != tp->protocol) continue; if (t > s_t) - memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + memset(&cb->args[1], 0, + sizeof(cb->args)-sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid, + if (tcf_fill_node(net, skb, tp, 0, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) break; @@ -509,8 +541,12 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); - INIT_LIST_HEAD(&exts->actions); + LIST_HEAD(actions); + + tcf_exts_to_list(exts, &actions); + tcf_action_destroy(&actions, TCA_ACT_UNBIND); + kfree(exts->actions); + exts->nr_actions = 0; #endif } EXPORT_SYMBOL(tcf_exts_destroy); @@ -522,7 +558,6 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, { struct tc_action *act; - INIT_LIST_HEAD(&exts->actions); if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tb[exts->police], rate_tlv, "police", ovr, @@ -531,14 +566,20 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, return PTR_ERR(act); act->type = exts->type = TCA_OLD_COMPAT; - list_add(&act->list, &exts->actions); + exts->actions[0] = act; + exts->nr_actions = 1; } else if (exts->action && tb[exts->action]) { - int err; + LIST_HEAD(actions); + int err, i = 0; + err = tcf_action_init(net, tb[exts->action], rate_tlv, NULL, ovr, - TCA_ACT_BIND, &exts->actions); + TCA_ACT_BIND, &actions); if (err) return err; + list_for_each_entry(act, &actions, list) + exts->actions[i++] = act; + exts->nr_actions = i; } } #else @@ -555,37 +596,49 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT - LIST_HEAD(tmp); + struct tcf_exts old = *dst; + tcf_tree_lock(tp); - list_splice_init(&dst->actions, &tmp); - list_splice(&src->actions, &dst->actions); + dst->nr_actions = src->nr_actions; + dst->actions = src->actions; dst->type = src->type; tcf_tree_unlock(tp); - tcf_action_destroy(&tmp, TCA_ACT_UNBIND); + + tcf_exts_destroy(&old); #endif } EXPORT_SYMBOL(tcf_exts_change); -#define tcf_exts_first_act(ext) \ - list_first_entry_or_null(&(exts)->actions, \ - struct tc_action, list) +#ifdef CONFIG_NET_CLS_ACT +static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts) +{ + if (exts->nr_actions == 0) + return NULL; + else + return exts->actions[0]; +} +#endif int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT struct nlattr *nest; - if (exts->action && !list_empty(&exts->actions)) { + if (exts->action && exts->nr_actions) { /* * again for backward compatible mode - we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ if (exts->type != TCA_OLD_COMPAT) { + LIST_HEAD(actions); + nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0) + + tcf_exts_to_list(exts, &actions); + if (tcf_action_dump(skb, &actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 7b342c779..c3002c2c6 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -272,15 +272,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); - fp = bpf_prog_get(bpf_fd); + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); if (IS_ERR(fp)) return PTR_ERR(fp); - if (fp->type != BPF_PROG_TYPE_SCHED_CLS) { - bpf_prog_put(fp); - return -EINVAL; - } - if (tb[TCA_BPF_NAME]) { name = kmemdup(nla_data(tb[TCA_BPF_NAME]), nla_len(tb[TCA_BPF_NAME]), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b3b7978f4..5060801a2 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -66,6 +66,7 @@ struct cls_fl_filter { struct fl_flow_key key; struct list_head list; u32 handle; + u32 flags; struct rcu_head rcu; }; @@ -123,6 +124,9 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; + if (!atomic_read(&head->ht.nelems)) + return -1; + fl_clear_masked_range(&skb_key, &head->mask); skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, @@ -136,7 +140,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = rhashtable_lookup_fast(&head->ht, fl_key_get_start(&skb_mkey, &head->mask), head->ht_params); - if (f) { + if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); } @@ -183,19 +187,20 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie) dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); } -static void fl_hw_replace_filter(struct tcf_proto *tp, - struct flow_dissector *dissector, - struct fl_flow_key *mask, - struct fl_flow_key *key, - struct tcf_exts *actions, - unsigned long cookie, u32 flags) +static int fl_hw_replace_filter(struct tcf_proto *tp, + struct flow_dissector *dissector, + struct fl_flow_key *mask, + struct fl_flow_key *key, + struct tcf_exts *actions, + unsigned long cookie, u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload offload = {0}; struct tc_to_netdev tc; + int err; if (!tc_should_offload(dev, tp, flags)) - return; + return tc_skip_sw(flags) ? -EINVAL : 0; offload.command = TC_CLSFLOWER_REPLACE; offload.cookie = cookie; @@ -207,7 +212,12 @@ static void fl_hw_replace_filter(struct tcf_proto *tp, tc.type = TC_SETUP_CLSFLOWER; tc.cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + + if (tc_skip_sw(flags)) + return err; + + return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) @@ -524,7 +534,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, struct cls_fl_filter *fnew; struct nlattr *tb[TCA_FLOWER_MAX + 1]; struct fl_flow_mask mask = {}; - u32 flags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -552,8 +561,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } fnew->handle = handle; - if (tb[TCA_FLOWER_FLAGS]) - flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); + if (tb[TCA_FLOWER_FLAGS]) { + fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); + + if (!tc_flags_valid(fnew->flags)) { + err = -EINVAL; + goto errout; + } + } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) @@ -563,19 +578,23 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout; - err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, - head->ht_params); + if (!tc_skip_sw(fnew->flags)) { + err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, + head->ht_params); + if (err) + goto errout; + } + + err = fl_hw_replace_filter(tp, + &head->dissector, + &mask.key, + &fnew->key, + &fnew->exts, + (unsigned long)fnew, + fnew->flags); if (err) goto errout; - fl_hw_replace_filter(tp, - &head->dissector, - &mask.key, - &fnew->key, - &fnew->exts, - (unsigned long)fnew, - flags); - if (fold) { rhashtable_remove_fast(&head->ht, &fold->ht_node, head->ht_params); @@ -734,6 +753,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->tp.dst)))) goto nla_put_failure; + nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); + if (tcf_exts_dump(skb, &f->exts)) goto nla_put_failure; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c new file mode 100644 index 000000000..25927b6c4 --- /dev/null +++ b/net/sched/cls_matchall.c @@ -0,0 +1,318 @@ +/* + * net/sched/cls_matchll.c Match-all classifier + * + * Copyright (c) 2016 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include + +#include +#include + +struct cls_mall_filter { + struct tcf_exts exts; + struct tcf_result res; + u32 handle; + struct rcu_head rcu; + u32 flags; +}; + +struct cls_mall_head { + struct cls_mall_filter *filter; + struct rcu_head rcu; +}; + +static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_mall_head *head = rcu_dereference_bh(tp->root); + struct cls_mall_filter *f = head->filter; + + if (tc_skip_sw(f->flags)) + return -1; + + return tcf_exts_exec(skb, &f->exts, res); +} + +static int mall_init(struct tcf_proto *tp) +{ + struct cls_mall_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + + rcu_assign_pointer(tp->root, head); + + return 0; +} + +static void mall_destroy_filter(struct rcu_head *head) +{ + struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu); + + tcf_exts_destroy(&f->exts); + + kfree(f); +} + +static int mall_replace_hw_filter(struct tcf_proto *tp, + struct cls_mall_filter *f, + unsigned long cookie) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_to_netdev offload; + struct tc_cls_matchall_offload mall_offload = {0}; + + offload.type = TC_SETUP_MATCHALL; + offload.cls_mall = &mall_offload; + offload.cls_mall->command = TC_CLSMATCHALL_REPLACE; + offload.cls_mall->exts = &f->exts; + offload.cls_mall->cookie = cookie; + + return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &offload); +} + +static void mall_destroy_hw_filter(struct tcf_proto *tp, + struct cls_mall_filter *f, + unsigned long cookie) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_to_netdev offload; + struct tc_cls_matchall_offload mall_offload = {0}; + + offload.type = TC_SETUP_MATCHALL; + offload.cls_mall = &mall_offload; + offload.cls_mall->command = TC_CLSMATCHALL_DESTROY; + offload.cls_mall->exts = NULL; + offload.cls_mall->cookie = cookie; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &offload); +} + +static bool mall_destroy(struct tcf_proto *tp, bool force) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct net_device *dev = tp->q->dev_queue->dev; + struct cls_mall_filter *f = head->filter; + + if (!force && f) + return false; + + if (f) { + if (tc_should_offload(dev, tp, f->flags)) + mall_destroy_hw_filter(tp, f, (unsigned long) f); + + call_rcu(&f->rcu, mall_destroy_filter); + } + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); + return true; +} + +static unsigned long mall_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct cls_mall_filter *f = head->filter; + + if (f && f->handle == handle) + return (unsigned long) f; + return 0; +} + +static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { + [TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC }, + [TCA_MATCHALL_CLASSID] = { .type = NLA_U32 }, +}; + +static int mall_set_parms(struct net *net, struct tcf_proto *tp, + struct cls_mall_filter *f, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts e; + int err; + + tcf_exts_init(&e, TCA_MATCHALL_ACT, 0); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + return err; + + if (tb[TCA_MATCHALL_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +} + +static int mall_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg; + struct net_device *dev = tp->q->dev_queue->dev; + struct cls_mall_filter *f; + struct nlattr *tb[TCA_MATCHALL_MAX + 1]; + u32 flags = 0; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + if (head->filter) + return -EBUSY; + + if (fold) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_MATCHALL_MAX, + tca[TCA_OPTIONS], mall_policy); + if (err < 0) + return err; + + if (tb[TCA_MATCHALL_FLAGS]) { + flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); + if (!tc_flags_valid(flags)) + return -EINVAL; + } + + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOBUFS; + + tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0); + + if (!handle) + handle = 1; + f->handle = handle; + f->flags = flags; + + err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); + if (err) + goto errout; + + if (tc_should_offload(dev, tp, flags)) { + err = mall_replace_hw_filter(tp, f, (unsigned long) f); + if (err) { + if (tc_skip_sw(flags)) + goto errout; + else + err = 0; + } + } + + *arg = (unsigned long) f; + rcu_assign_pointer(head->filter, f); + + return 0; + +errout: + kfree(f); + return err; +} + +static int mall_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct cls_mall_filter *f = (struct cls_mall_filter *) arg; + struct net_device *dev = tp->q->dev_queue->dev; + + if (tc_should_offload(dev, tp, f->flags)) + mall_destroy_hw_filter(tp, f, (unsigned long) f); + + RCU_INIT_POINTER(head->filter, NULL); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, mall_destroy_filter); + return 0; +} + +static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct cls_mall_filter *f = head->filter; + + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) f, arg) < 0) + arg->stop = 1; +skip: + arg->count++; +} + +static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_mall_filter *f = (struct cls_mall_filter *) fh; + struct nlattr *nest; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_MATCHALL_CLASSID, f->res.classid)) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct tcf_proto_ops cls_mall_ops __read_mostly = { + .kind = "matchall", + .classify = mall_classify, + .init = mall_init, + .destroy = mall_destroy, + .get = mall_get, + .change = mall_change, + .delete = mall_delete, + .walk = mall_walk, + .dump = mall_dump, + .owner = THIS_MODULE, +}; + +static int __init cls_mall_init(void) +{ + return register_tcf_proto_ops(&cls_mall_ops); +} + +static void __exit cls_mall_exit(void) +{ + unregister_tcf_proto_ops(&cls_mall_ops); +} + +module_init(cls_mall_init); +module_exit(cls_mall_exit); + +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Match-all classifier"); +MODULE_LICENSE("GPL v2"); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ddf047df5..12ebde845 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -95,8 +95,6 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, Expected action: do not backoff, but wait until queue will clear. NET_XMIT_CN - probably this packet enqueued, but another one dropped. Expected action: backoff or ignore - NET_XMIT_POLICED - dropped by police. - Expected action: backoff or error to real-time apps. Auxiliary routines: @@ -583,7 +581,6 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) timer); rcu_read_lock(); - qdisc_unthrottled(wd->qdisc); __netif_schedule(qdisc_root(wd->qdisc)); rcu_read_unlock(); @@ -598,15 +595,12 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; - if (throttle) - qdisc_throttled(wd->qdisc); - if (wd->last_expires == expires) return; @@ -620,7 +614,6 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { hrtimer_cancel(&wd->timer); - qdisc_unthrottled(wd->qdisc); } EXPORT_SYMBOL(qdisc_watchdog_cancel); @@ -982,7 +975,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { - spinlock_t *root_lock; + seqcount_t *running; err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) @@ -991,14 +984,15 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, if ((sch->parent != TC_H_ROOT) && !(sch->flags & TCQ_F_INGRESS) && (!p || !(p->flags & TCQ_F_MQROOT))) - root_lock = qdisc_root_sleeping_lock(sch); + running = qdisc_root_sleeping_running(sch); else - root_lock = qdisc_lock(sch); + running = &sch->running; err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, - root_lock, + NULL, + running, tca[TCA_RATE]); if (err) goto err_out4; @@ -1061,7 +1055,8 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) gen_replace_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); } out: @@ -1369,8 +1364,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d, - TCA_PAD) < 0) + NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) @@ -1381,7 +1375,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, cpu_qstats = q->cpu_qstats; } - if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), + &d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; @@ -1684,8 +1679,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d, - TCA_PAD) < 0) + NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 1911af3ca..481e4f12a 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -357,16 +357,17 @@ static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch, /* --------------------------- Qdisc operations ---------------------------- */ -static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; struct tcf_result res; int result; - int ret = NET_XMIT_POLICED; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); - result = TC_POLICE_OK; /* be nice to gcc */ + result = TC_ACT_OK; /* be nice to gcc */ flow = NULL; if (TC_H_MAJ(skb->priority) != sch->handle || !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) { @@ -398,12 +399,12 @@ done: switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: - kfree_skb(skb); + __qdisc_drop(skb, to_free); goto drop; - case TC_POLICE_RECLASSIFY: + case TC_ACT_RECLASSIFY: if (flow->excess) flow = flow->excess; else @@ -413,7 +414,7 @@ done: #endif } - ret = qdisc_enqueue(skb, flow->q); + ret = qdisc_enqueue(skb, flow->q, to_free); if (ret != NET_XMIT_SUCCESS) { drop: __maybe_unused if (net_xmit_drop_count(ret)) { @@ -519,20 +520,6 @@ static struct sk_buff *atm_tc_peek(struct Qdisc *sch) return p->link.q->ops->peek(p->link.q); } -static unsigned int atm_tc_drop(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - unsigned int len; - - pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) { - if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) - return len; - } - return 0; -} - static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) { struct atm_qdisc_data *p = qdisc_priv(sch); @@ -637,7 +624,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &flow->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; @@ -671,7 +659,6 @@ static struct Qdisc_ops atm_qdisc_ops __read_mostly = { .enqueue = atm_tc_enqueue, .dequeue = atm_tc_dequeue, .peek = atm_tc_peek, - .drop = atm_tc_drop, .init = atm_tc_init, .reset = atm_tc_reset, .destroy = atm_tc_destroy, diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index 3fee70d98..c98a61e98 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -17,9 +17,10 @@ #include #include -static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_SUCCESS; } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index baafddf22..beb554aa8 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -80,10 +80,6 @@ struct cbq_class { unsigned char priority; /* class priority */ unsigned char priority2; /* priority to be used after overlimit */ unsigned char ewma_log; /* time constant for idle time calculation */ - unsigned char ovl_strategy; -#ifdef CONFIG_NET_CLS_ACT - unsigned char police; -#endif u32 defmap; @@ -94,10 +90,6 @@ struct cbq_class { u32 avpkt; struct qdisc_rate_table *R_tab; - /* Overlimit strategy parameters */ - void (*overlimit)(struct cbq_class *cl); - psched_tdiff_t penalty; - /* General scheduler (WRR) parameters */ long allot; long quantum; /* Allotment per WRR round */ @@ -353,7 +345,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) { int toplevel = q->toplevel; - if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { + if (toplevel > cl->level) { psched_time_t now = psched_get_time(); do { @@ -366,7 +358,8 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) } static int -cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct cbq_sched_data *q = qdisc_priv(sch); int uninitialized_var(ret); @@ -378,14 +371,11 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } -#ifdef CONFIG_NET_CLS_ACT - cl->q->__parent = sch; -#endif - ret = qdisc_enqueue(skb, cl->q); + ret = qdisc_enqueue(skb, cl->q, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; cbq_mark_toplevel(q, cl); @@ -402,11 +392,8 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; } -/* Overlimit actions */ - -/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ - -static void cbq_ovl_classic(struct cbq_class *cl) +/* Overlimit action: penalize leaf class by adding offtime */ +static void cbq_overlimit(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); psched_tdiff_t delay = cl->undertime - q->now; @@ -456,99 +443,6 @@ static void cbq_ovl_classic(struct cbq_class *cl) } } -/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when - * they go overlimit - */ - -static void cbq_ovl_rclassic(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - struct cbq_class *this = cl; - - do { - if (cl->level > q->toplevel) { - cl = NULL; - break; - } - } while ((cl = cl->borrow) != NULL); - - if (cl == NULL) - cl = this; - cbq_ovl_classic(cl); -} - -/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ - -static void cbq_ovl_delay(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = cl->undertime - q->now; - - if (test_bit(__QDISC_STATE_DEACTIVATED, - &qdisc_root_sleeping(cl->qdisc)->state)) - return; - - if (!cl->delayed) { - psched_time_t sched = q->now; - ktime_t expires; - - delay += cl->offtime; - if (cl->avgidle < 0) - delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); - if (cl->avgidle < cl->minidle) - cl->avgidle = cl->minidle; - cl->undertime = q->now + delay; - - if (delay > 0) { - sched += delay + cl->penalty; - cl->penalized = sched; - cl->cpriority = TC_CBQ_MAXPRIO; - q->pmask |= (1<delay_timer) && - ktime_to_ns(ktime_sub( - hrtimer_get_expires(&q->delay_timer), - expires)) > 0) - hrtimer_set_expires(&q->delay_timer, expires); - hrtimer_restart(&q->delay_timer); - cl->delayed = 1; - cl->xstats.overactions++; - return; - } - delay = 1; - } - if (q->wd_expires == 0 || q->wd_expires > delay) - q->wd_expires = delay; -} - -/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ - -static void cbq_ovl_lowprio(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - - cl->penalized = q->now + cl->penalty; - - if (cl->cpriority != cl->priority2) { - cl->cpriority = cl->priority2; - q->pmask |= (1<cpriority); - cl->xstats.overactions++; - } - cbq_ovl_classic(cl); -} - -/* TC_CBQ_OVL_DROP: penalize class by dropping */ - -static void cbq_ovl_drop(struct cbq_class *cl) -{ - if (cl->q->ops->drop) - if (cl->q->ops->drop(cl->q)) - cl->qdisc->q.qlen--; - cl->xstats.overactions++; - cbq_ovl_classic(cl); -} - static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, psched_time_t now) { @@ -620,45 +514,10 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED); } - qdisc_unthrottled(sch); __netif_schedule(qdisc_root(sch)); return HRTIMER_NORESTART; } -#ifdef CONFIG_NET_CLS_ACT -static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) -{ - struct Qdisc *sch = child->__parent; - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = q->rx_class; - - q->rx_class = NULL; - - if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { - int ret; - - cbq_mark_toplevel(q, cl); - - q->rx_class = cl; - cl->q->__parent = sch; - - ret = qdisc_enqueue(skb, cl->q); - if (ret == NET_XMIT_SUCCESS) { - sch->q.qlen++; - if (!cl->next_alive) - cbq_activate_class(cl); - return 0; - } - if (net_xmit_drop_count(ret)) - qdisc_qstats_drop(sch); - return 0; - } - - qdisc_qstats_drop(sch); - return -1; -} -#endif - /* * It is mission critical procedure. * @@ -807,7 +666,7 @@ cbq_under_limit(struct cbq_class *cl) cl = cl->borrow; if (!cl) { this_cl->qstats.overlimits++; - this_cl->overlimit(this_cl); + cbq_overlimit(this_cl); return NULL; } if (cl->level > q->toplevel) @@ -960,7 +819,6 @@ cbq_dequeue(struct Qdisc *sch) if (skb) { qdisc_bstats_update(sch, skb); sch->q.qlen--; - qdisc_unthrottled(sch); return skb; } @@ -1166,31 +1024,6 @@ static void cbq_link_class(struct cbq_class *this) } } -static unsigned int cbq_drop(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl, *cl_head; - int prio; - unsigned int len; - - for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { - cl_head = q->active[prio]; - if (!cl_head) - continue; - - cl = cl_head; - do { - if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { - sch->q.qlen--; - if (!cl->q->q.qlen) - cbq_deactivate_class(cl); - return len; - } - } while ((cl = cl->next_alive) != cl_head); - } - return 0; -} - static void cbq_reset(struct Qdisc *sch) { @@ -1280,50 +1113,6 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) return 0; } -static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) -{ - switch (ovl->strategy) { - case TC_CBQ_OVL_CLASSIC: - cl->overlimit = cbq_ovl_classic; - break; - case TC_CBQ_OVL_DELAY: - cl->overlimit = cbq_ovl_delay; - break; - case TC_CBQ_OVL_LOWPRIO: - if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO || - ovl->priority2 - 1 <= cl->priority) - return -EINVAL; - cl->priority2 = ovl->priority2 - 1; - cl->overlimit = cbq_ovl_lowprio; - break; - case TC_CBQ_OVL_DROP: - cl->overlimit = cbq_ovl_drop; - break; - case TC_CBQ_OVL_RCLASSIC: - cl->overlimit = cbq_ovl_rclassic; - break; - default: - return -EINVAL; - } - cl->penalty = ovl->penalty; - return 0; -} - -#ifdef CONFIG_NET_CLS_ACT -static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) -{ - cl->police = p->police; - - if (cl->q->handle) { - if (p->police == TC_POLICE_RECLASSIFY) - cl->q->reshape_fail = cbq_reshape_fail; - else - cl->q->reshape_fail = NULL; - } - return 0; -} -#endif - static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) { cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); @@ -1375,8 +1164,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) q->link.priority = TC_CBQ_MAXPRIO - 1; q->link.priority2 = TC_CBQ_MAXPRIO - 1; q->link.cpriority = TC_CBQ_MAXPRIO - 1; - q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; - q->link.overlimit = cbq_ovl_classic; q->link.allot = psched_mtu(qdisc_dev(sch)); q->link.quantum = q->link.allot; q->link.weight = q->link.R_tab->rate.rate; @@ -1463,24 +1250,6 @@ nla_put_failure: return -1; } -static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_ovl opt; - - opt.strategy = cl->ovl_strategy; - opt.priority2 = cl->priority2 + 1; - opt.pad = 0; - opt.penalty = cl->penalty; - if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt)) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); @@ -1500,36 +1269,11 @@ nla_put_failure: return -1; } -#ifdef CONFIG_NET_CLS_ACT -static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_police opt; - - if (cl->police) { - opt.police = cl->police; - opt.__res1 = 0; - opt.__res2 = 0; - if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt)) - goto nla_put_failure; - } - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} -#endif - static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) { if (cbq_dump_lss(skb, cl) < 0 || cbq_dump_rate(skb, cl) < 0 || cbq_dump_wrr(skb, cl) < 0 || - cbq_dump_ovl(skb, cl) < 0 || -#ifdef CONFIG_NET_CLS_ACT - cbq_dump_police(skb, cl) < 0 || -#endif cbq_dump_fopt(skb, cl) < 0) return -1; return 0; @@ -1600,7 +1344,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0) return -1; @@ -1618,11 +1363,6 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, &pfifo_qdisc_ops, cl->common.classid); if (new == NULL) return -ENOBUFS; - } else { -#ifdef CONFIG_NET_CLS_ACT - if (cl->police == TC_POLICE_RECLASSIFY) - new->reshape_fail = cbq_reshape_fail; -#endif } *old = qdisc_replace(sch, new, &cl->q); @@ -1735,6 +1475,9 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (err < 0) return err; + if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) + return -EOPNOTSUPP; + if (cl) { /* Check parent */ if (parentid) { @@ -1755,7 +1498,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { qdisc_put_rtab(rtab); @@ -1782,14 +1526,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); } - if (tb[TCA_CBQ_OVL_STRATEGY]) - cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); - -#ifdef CONFIG_NET_CLS_ACT - if (tb[TCA_CBQ_POLICE]) - cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); -#endif - if (tb[TCA_CBQ_FOPT]) cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); @@ -1848,7 +1584,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { kfree(cl); @@ -1884,13 +1621,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->maxidle = q->link.maxidle; if (cl->avpkt == 0) cl->avpkt = q->link.avpkt; - cl->overlimit = cbq_ovl_classic; - if (tb[TCA_CBQ_OVL_STRATEGY]) - cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); -#ifdef CONFIG_NET_CLS_ACT - if (tb[TCA_CBQ_POLICE]) - cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); -#endif if (tb[TCA_CBQ_FOPT]) cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); sch_tree_unlock(sch); @@ -2035,7 +1765,6 @@ static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { .enqueue = cbq_enqueue, .dequeue = cbq_dequeue, .peek = qdisc_peek_dequeued, - .drop = cbq_drop, .init = cbq_init, .reset = cbq_reset, .destroy = cbq_destroy, diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 0a08c860e..3b6d5bd69 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -115,7 +115,8 @@ static void choke_zap_tail_holes(struct choke_sched_data *q) } /* Drop packet from queue array by creating a "hole" */ -static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) +static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, + struct sk_buff **to_free) { struct choke_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = q->tab[idx]; @@ -129,7 +130,7 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) qdisc_qstats_backlog_dec(sch, skb); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); --sch->q.qlen; } @@ -261,7 +262,8 @@ static bool choke_match_random(const struct choke_sched_data *q, return choke_match_flow(oskb, nskb); } -static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; struct choke_sched_data *q = qdisc_priv(sch); @@ -288,7 +290,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Draw a packet at random from queue and compare flow */ if (choke_match_random(q, skb, &idx)) { q->stats.matched++; - choke_drop_by_idx(sch, idx); + choke_drop_by_idx(sch, idx, to_free); goto congestion_drop; } @@ -331,16 +333,16 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) } q->stats.pdrop++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); congestion_drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } @@ -365,22 +367,6 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch) return skb; } -static unsigned int choke_drop(struct Qdisc *sch) -{ - struct choke_sched_data *q = qdisc_priv(sch); - unsigned int len; - - len = qdisc_queue_drop(sch); - if (len > 0) - q->stats.other++; - else { - if (!red_is_idling(&q->vars)) - red_start_of_idle_period(&q->vars); - } - - return len; -} - static void choke_reset(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); @@ -391,11 +377,11 @@ static void choke_reset(struct Qdisc *sch) q->head = (q->head + 1) & q->tab_mask; if (!skb) continue; - qdisc_qstats_backlog_dec(sch, skb); - --sch->q.qlen; - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } + sch->q.qlen = 0; + sch->qstats.backlog = 0; memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); q->head = q->tail = 0; red_restart(&q->vars); @@ -471,7 +457,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped); q->head = 0; @@ -569,7 +555,6 @@ static struct Qdisc_ops choke_qdisc_ops __read_mostly = { .enqueue = choke_enqueue, .dequeue = choke_dequeue, .peek = choke_peek_head, - .drop = choke_drop, .init = choke_init, .destroy = choke_destroy, .reset = choke_reset, diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index dddf3bb65..4002df3c7 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -82,7 +82,8 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - qdisc_drop(skb, sch); + kfree_skb(skb); + qdisc_qstats_drop(sch); } static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) @@ -107,7 +108,8 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) return skb; } -static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct codel_sched_data *q; @@ -117,7 +119,7 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) } q = qdisc_priv(sch); q->drop_overlimit++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { @@ -174,7 +176,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index bf8af2c43..8af5c59ee 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -91,7 +91,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -119,7 +120,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { qdisc_destroy(cl->qdisc); @@ -279,7 +281,8 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (qlen) xstats.deficit = cl->deficit; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0) return -1; @@ -347,7 +350,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, return NULL; } -static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; @@ -357,11 +361,11 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } - err = qdisc_enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; @@ -420,27 +424,6 @@ out: return NULL; } -static unsigned int drr_drop(struct Qdisc *sch) -{ - struct drr_sched *q = qdisc_priv(sch); - struct drr_class *cl; - unsigned int len; - - list_for_each_entry(cl, &q->active, alist) { - if (cl->qdisc->ops->drop) { - len = cl->qdisc->ops->drop(cl->qdisc); - if (len > 0) { - sch->qstats.backlog -= len; - sch->q.qlen--; - if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); - return len; - } - } - } - return 0; -} - static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct drr_sched *q = qdisc_priv(sch); @@ -510,7 +493,6 @@ static struct Qdisc_ops drr_qdisc_ops __read_mostly = { .enqueue = drr_enqueue, .dequeue = drr_dequeue, .peek = qdisc_peek_dequeued, - .drop = drr_drop, .init = drr_init_qdisc, .reset = drr_reset_qdisc, .destroy = drr_destroy_qdisc, diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 34b4ddaca..1308bbf46 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -191,7 +191,8 @@ static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch, /* --------------------------- Qdisc operations ---------------------------- */ -static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; @@ -234,7 +235,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) #ifdef CONFIG_NET_CLS_ACT case TC_ACT_QUEUED: case TC_ACT_STOLEN: - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: @@ -251,7 +252,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } - err = qdisc_enqueue(skb, p->q); + err = qdisc_enqueue(skb, p->q, to_free); if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); @@ -264,7 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -320,23 +321,6 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch) return p->q->ops->peek(p->q); } -static unsigned int dsmark_drop(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - unsigned int len; - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - if (p->q->ops->drop == NULL) - return 0; - - len = p->q->ops->drop(p->q); - if (len) - sch->q.qlen--; - - return len; -} - static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) { struct dsmark_qdisc_data *p = qdisc_priv(sch); @@ -489,7 +473,6 @@ static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { .enqueue = dsmark_enqueue, .dequeue = dsmark_dequeue, .peek = dsmark_peek, - .drop = dsmark_drop, .init = dsmark_init, .reset = dsmark_reset, .destroy = dsmark_destroy, diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 2e4bd2c0a..baeed6a78 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -19,23 +19,26 @@ /* 1 band FIFO pseudo-"scheduler" */ -static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch, to_free); } -static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch, to_free); } -static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { unsigned int prev_backlog; @@ -44,7 +47,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) prev_backlog = sch->qstats.backlog; /* queue full, remove one skb to fulfill the limit */ - __qdisc_queue_drop_head(sch, &sch->q); + __qdisc_queue_drop_head(sch, &sch->q, to_free); qdisc_qstats_drop(sch); qdisc_enqueue_tail(skb, sch); @@ -103,7 +106,6 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .enqueue = pfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, .change = fifo_init, @@ -118,7 +120,6 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .enqueue = bfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, .change = fifo_init, @@ -133,7 +134,6 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { .enqueue = pfifo_tail_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .drop = qdisc_queue_drop_head, .init = fifo_init, .reset = qdisc_reset_queue, .change = fifo_init, diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 3c6a47d66..e5458b99e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -368,18 +368,19 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) } } -static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct fq_sched_data *q = qdisc_priv(sch); struct fq_flow *f; if (unlikely(sch->q.qlen >= sch->limit)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); f = fq_classify(skb, q); if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { q->stat_flows_plimit++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } f->qlen++; @@ -445,8 +446,7 @@ begin: if (!head->first) { if (q->time_next_delayed_flow != ~0ULL) qdisc_watchdog_schedule_ns(&q->watchdog, - q->time_next_delayed_flow, - false); + q->time_next_delayed_flow); return NULL; } } @@ -515,17 +515,25 @@ out: return skb; } +static void fq_flow_purge(struct fq_flow *flow) +{ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; + flow->qlen = 0; +} + static void fq_reset(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); struct rb_root *root; - struct sk_buff *skb; struct rb_node *p; struct fq_flow *f; unsigned int idx; - while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) - kfree_skb(skb); + sch->q.qlen = 0; + sch->qstats.backlog = 0; + + fq_flow_purge(&q->internal); if (!q->fq_root) return; @@ -536,8 +544,7 @@ static void fq_reset(struct Qdisc *sch) f = container_of(p, struct fq_flow, fq_node); rb_erase(p, root); - while ((skb = fq_dequeue_head(sch, f)) != NULL) - kfree_skb(skb); + fq_flow_purge(f); kmem_cache_free(fq_flow_cachep, f); } @@ -738,7 +745,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (!skb) break; drop_len += qdisc_pkt_len(skb); - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); drop_count++; } qdisc_tree_reduce_backlog(sch, drop_count, drop_len); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index da250b2e0..a5ea0e9b6 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -139,7 +139,8 @@ static inline void flow_queue_add(struct fq_codel_flow *flow, skb->next = NULL; } -static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) +static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, + struct sk_buff **to_free) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; @@ -171,8 +172,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) do { skb = dequeue_head(flow); len += qdisc_pkt_len(skb); - mem += skb->truesize; - kfree_skb(skb); + mem += get_codel_cb(skb)->mem_usage; + __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); flow->dropped += i; @@ -184,16 +185,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) return idx; } -static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) -{ - unsigned int prev_backlog; - - prev_backlog = sch->qstats.backlog; - fq_codel_drop(sch, 1U); - return prev_backlog - sch->qstats.backlog; -} - -static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct fq_codel_sched_data *q = qdisc_priv(sch); unsigned int idx, prev_backlog, prev_qlen; @@ -206,7 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } idx--; @@ -223,7 +216,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow->deficit = q->quantum; flow->dropped = 0; } - q->memory_usage += skb->truesize; + get_codel_cb(skb)->mem_usage = skb->truesize; + q->memory_usage += get_codel_cb(skb)->mem_usage; memory_limited = q->memory_usage > q->memory_limit; if (++sch->q.qlen <= sch->limit && !memory_limited) return NET_XMIT_SUCCESS; @@ -238,7 +232,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) * So instead of dropping a single packet, drop half of its backlog * with a 64 packets limit to not add a too big cpu spike here. */ - ret = fq_codel_drop(sch, q->drop_batch_size); + ret = fq_codel_drop(sch, q->drop_batch_size, to_free); prev_qlen -= sch->q.qlen; prev_backlog -= sch->qstats.backlog; @@ -274,7 +268,7 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) if (flow->head) { skb = dequeue_head(flow); q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); - q->memory_usage -= skb->truesize; + q->memory_usage -= get_codel_cb(skb)->mem_usage; sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); } @@ -285,7 +279,8 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - qdisc_drop(skb, sch); + kfree_skb(skb); + qdisc_qstats_drop(sch); } static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) @@ -345,6 +340,12 @@ begin: return skb; } +static void fq_codel_flow_purge(struct fq_codel_flow *flow) +{ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; +} + static void fq_codel_reset(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -355,18 +356,13 @@ static void fq_codel_reset(struct Qdisc *sch) for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; - while (flow->head) { - struct sk_buff *skb = dequeue_head(flow); - - qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); - } - + fq_codel_flow_purge(flow); INIT_LIST_HEAD(&flow->flowchain); codel_vars_init(&flow->cvars); } memset(q->backlogs, 0, q->flows_cnt * sizeof(u32)); sch->q.qlen = 0; + sch->qstats.backlog = 0; q->memory_usage = 0; } @@ -442,7 +438,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) struct sk_buff *skb = fq_codel_dequeue(sch); q->cstats.drop_len += qdisc_pkt_len(skb); - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); q->cstats.drop_count++; } qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); @@ -578,11 +574,13 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.memory_usage = q->memory_usage; st.qdisc_stats.drop_overmemory = q->drop_overmemory; + sch_tree_lock(sch); list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; list_for_each(pos, &q->old_flows) st.qdisc_stats.old_flows_len++; + sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); } @@ -636,7 +634,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (idx < q->flows_cnt) { const struct fq_codel_flow *flow = &q->flows[idx]; - const struct sk_buff *skb = flow->head; + const struct sk_buff *skb; memset(&xstats, 0, sizeof(xstats)); xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; @@ -654,9 +652,14 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, codel_time_to_us(delta) : -codel_time_to_us(-delta); } - while (skb) { - qs.qlen++; - skb = skb->next; + if (flow->head) { + sch_tree_lock(sch); + skb = flow->head; + while (skb) { + qs.qlen++; + skb = skb->next; + } + sch_tree_unlock(sch); } qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; @@ -709,7 +712,6 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .enqueue = fq_codel_enqueue, .dequeue = fq_codel_dequeue, .peek = qdisc_peek_dequeued, - .drop = fq_codel_qdisc_drop, .init = fq_codel_init, .reset = fq_codel_reset, .destroy = fq_codel_destroy, diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f9e0e9c03..657c13362 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -77,6 +77,34 @@ static void try_bulk_dequeue_skb(struct Qdisc *q, skb->next = NULL; } +/* This variant of try_bulk_dequeue_skb() makes sure + * all skbs in the chain are for the same txq + */ +static void try_bulk_dequeue_skb_slow(struct Qdisc *q, + struct sk_buff *skb, + int *packets) +{ + int mapping = skb_get_queue_mapping(skb); + struct sk_buff *nskb; + int cnt = 0; + + do { + nskb = q->dequeue(q); + if (!nskb) + break; + if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { + q->skb_bad_txq = nskb; + qdisc_qstats_backlog_inc(q, nskb); + q->q.qlen++; + break; + } + skb->next = nskb; + skb = nskb; + } while (++cnt < 8); + (*packets) += cnt; + skb->next = NULL; +} + /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ @@ -87,8 +115,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, const struct netdev_queue *txq = q->dev_queue; *packets = 1; - *validate = true; if (unlikely(skb)) { + /* skb in gso_skb were already validated */ + *validate = false; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { @@ -97,22 +126,37 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, q->q.qlen--; } else skb = NULL; - /* skb in gso_skb were already validated */ - *validate = false; - } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || - !netif_xmit_frozen_or_stopped(txq)) { - skb = q->dequeue(q); - if (skb && qdisc_may_bulk(q)) - try_bulk_dequeue_skb(q, skb, txq, packets); + return skb; + } + *validate = true; + skb = q->skb_bad_txq; + if (unlikely(skb)) { + /* check the reason of requeuing without tx lock first */ + txq = skb_get_tx_queue(txq->dev, skb); + if (!netif_xmit_frozen_or_stopped(txq)) { + q->skb_bad_txq = NULL; + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + goto bulk; } + return NULL; + } + if (!(q->flags & TCQ_F_ONETXQUEUE) || + !netif_xmit_frozen_or_stopped(txq)) + skb = q->dequeue(q); + if (skb) { +bulk: + if (qdisc_may_bulk(q)) + try_bulk_dequeue_skb(q, skb, txq, packets); + else + try_bulk_dequeue_skb_slow(q, skb, packets); } return skb; } /* * Transmit possibly several skbs, and handle the return status as - * required. Holding the __QDISC___STATE_RUNNING bit guarantees that + * required. Owning running seqcount bit guarantees that * only one CPU can execute this function. * * Returns to the caller: @@ -165,7 +209,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * - * __QDISC___STATE_RUNNING guarantees only one CPU can process + * running seqcount guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * @@ -348,9 +392,10 @@ EXPORT_SYMBOL(netif_carrier_off); cheaper. */ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, + struct sk_buff **to_free) { - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_CN; } @@ -381,6 +426,7 @@ struct Qdisc noop_qdisc = { .list = LIST_HEAD_INIT(noop_qdisc.list), .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, + .running = SEQCNT_ZERO(noop_qdisc.running), .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), }; EXPORT_SYMBOL(noop_qdisc); @@ -438,7 +484,8 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, return priv->q + band; } -static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, + struct sk_buff **to_free) { if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; @@ -450,7 +497,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) return __qdisc_enqueue_tail(skb, qdisc, list); } - return qdisc_drop(skb, qdisc); + return qdisc_drop(skb, qdisc, to_free); } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) @@ -492,7 +539,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(qdisc, band2list(priv, prio)); + __qdisc_reset_queue(band2list(priv, prio)); priv->bitmap = 0; qdisc->qstats.backlog = 0; @@ -539,6 +586,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; +static struct lock_class_key qdisc_running_key; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops) @@ -572,6 +620,10 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); + sch->ops = ops; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; @@ -591,18 +643,19 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, struct Qdisc *sch; if (!try_module_get(ops->owner)) - goto errout; + return NULL; sch = qdisc_alloc(dev_queue, ops); - if (IS_ERR(sch)) - goto errout; + if (IS_ERR(sch)) { + module_put(ops->owner); + return NULL; + } sch->parent = parentid; if (!ops->init || ops->init(sch, NULL) == 0) return sch; qdisc_destroy(sch); -errout: return NULL; } EXPORT_SYMBOL(qdisc_create_dflt); @@ -616,11 +669,14 @@ void qdisc_reset(struct Qdisc *qdisc) if (ops->reset) ops->reset(qdisc); + kfree_skb(qdisc->skb_bad_txq); + qdisc->skb_bad_txq = NULL; + if (qdisc->gso_skb) { kfree_skb_list(qdisc->gso_skb); qdisc->gso_skb = NULL; - qdisc->q.qlen = 0; } + qdisc->q.qlen = 0; } EXPORT_SYMBOL(qdisc_reset); @@ -659,6 +715,7 @@ void qdisc_destroy(struct Qdisc *qdisc) dev_put(qdisc_dev(qdisc)); kfree_skb_list(qdisc->gso_skb); + kfree_skb(qdisc->skb_bad_txq); /* * gen_estimator est_timer() might access qdisc->q.lock, * wait a RCU grace period before freeing qdisc. diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 80105109f..c78a093c5 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -149,7 +149,8 @@ static inline int gred_use_harddrop(struct gred_sched *t) return t->red_flags & TC_RED_HARDDROP; } -static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct gred_sched_data *q = NULL; struct gred_sched *t = qdisc_priv(sch); @@ -237,10 +238,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->stats.pdrop++; drop: - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); congestion_drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } @@ -276,40 +277,6 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch) return NULL; } -static unsigned int gred_drop(struct Qdisc *sch) -{ - struct sk_buff *skb; - struct gred_sched *t = qdisc_priv(sch); - - skb = qdisc_dequeue_tail(sch); - if (skb) { - unsigned int len = qdisc_pkt_len(skb); - struct gred_sched_data *q; - u16 dp = tc_index_to_dp(skb); - - if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n", - tc_index_to_dp(skb)); - } else { - q->backlog -= len; - q->stats.other++; - - if (gred_wred_mode(t)) { - if (!sch->qstats.backlog) - red_start_of_idle_period(&t->wred_set); - } else { - if (!q->backlog) - red_start_of_idle_period(&q->vars); - } - } - - qdisc_drop(skb, sch); - return len; - } - - return 0; -} - static void gred_reset(struct Qdisc *sch) { int i; @@ -623,7 +590,6 @@ static struct Qdisc_ops gred_qdisc_ops __read_mostly = { .enqueue = gred_enqueue, .dequeue = gred_dequeue, .peek = qdisc_peek_head, - .drop = gred_drop, .init = gred_init, .reset = gred_reset, .destroy = gred_destroy, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 1ac9f9f03..3ddc7bd74 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -115,9 +115,9 @@ struct hfsc_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; struct gnet_stats_rate_est64 rate_est; - unsigned int level; /* class level in hierarchy */ struct tcf_proto __rcu *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ + unsigned int level; /* class level in hierarchy */ struct hfsc_sched *sched; /* scheduler data */ struct hfsc_class *cl_parent; /* parent class */ @@ -130,7 +130,6 @@ struct hfsc_class { struct rb_node vt_node; /* parent's vt_tree member */ struct rb_root cf_tree; /* active children sorted by cl_f */ struct rb_node cf_node; /* parent's cf_heap member */ - struct list_head dlist; /* drop list member */ u64 cl_total; /* total work in bytes */ u64 cl_cumul; /* cumulative work in bytes done by @@ -166,10 +165,10 @@ struct hfsc_class { struct runtime_sc cl_virtual; /* virtual curve */ struct runtime_sc cl_ulimit; /* upperlimit curve */ - unsigned long cl_flags; /* which curves are valid */ - unsigned long cl_vtperiod; /* vt period sequence number */ - unsigned long cl_parentperiod;/* parent's vt period sequence number*/ - unsigned long cl_nactive; /* number of active children */ + u8 cl_flags; /* which curves are valid */ + u32 cl_vtperiod; /* vt period sequence number */ + u32 cl_parentperiod;/* parent's vt period sequence number*/ + u32 cl_nactive; /* number of active children */ }; struct hfsc_sched { @@ -177,8 +176,6 @@ struct hfsc_sched { struct hfsc_class root; /* root class */ struct Qdisc_class_hash clhash; /* class hash */ struct rb_root eligible; /* eligible tree */ - struct list_head droplist; /* active leaf class list (for - dropping) */ struct qdisc_watchdog watchdog; /* watchdog timer */ }; @@ -781,6 +778,20 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) else go_passive = 0; + /* update vt */ + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + - cl->cl_vtoff + cl->cl_vtadj; + + /* + * if vt of the class is smaller than cvtmin, + * the class was skipped in the past due to non-fit. + * if so, we need to adjust vtadj. + */ + if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { + cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; + cl->cl_vt = cl->cl_parent->cl_cvtmin; + } + if (go_passive) { /* no more active child, going passive */ @@ -797,25 +808,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) continue; } - /* - * update vt and f - */ - cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) - - cl->cl_vtoff + cl->cl_vtadj; - - /* - * if vt of the class is smaller than cvtmin, - * the class was skipped in the past due to non-fit. - * if so, we need to adjust vtadj. - */ - if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { - cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; - cl->cl_vt = cl->cl_parent->cl_cvtmin; - } - /* update the vt tree */ vttree_update(cl); + /* update f */ if (cl->cl_flags & HFSC_USC) { cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, cl->cl_total); @@ -858,7 +854,6 @@ set_active(struct hfsc_class *cl, unsigned int len) if (cl->cl_flags & HFSC_FSC) init_vf(cl, len); - list_add_tail(&cl->dlist, &cl->sched->droplist); } static void @@ -867,8 +862,6 @@ set_passive(struct hfsc_class *cl) if (cl->cl_flags & HFSC_RSC) eltree_remove(cl); - list_del(&cl->dlist); - /* * vttree is now handled in update_vf() so that update_vf(cl, 0, 0) * needs to be called explicitly to remove a class from vttree. @@ -882,7 +875,7 @@ qdisc_peek_len(struct Qdisc *sch) unsigned int len; skb = sch->ops->peek(sch); - if (skb == NULL) { + if (unlikely(skb == NULL)) { qdisc_warn_nonwc("qdisc_peek_len", sch); return 0; } @@ -947,7 +940,7 @@ static void hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) { sc2isc(fsc, &cl->cl_fsc); - rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vtoff + cl->cl_vt, cl->cl_total); cl->cl_flags |= HFSC_FSC; } @@ -1015,11 +1008,10 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cur_time = psched_get_time(); if (tca[TCA_RATE]) { - spinlock_t *lock = qdisc_root_sleeping_lock(sch); - err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - lock, + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -1068,7 +1060,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { kfree(cl); @@ -1373,7 +1366,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0) return -1; @@ -1443,7 +1436,6 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) if (err < 0) return err; q->eligible = RB_ROOT; - INIT_LIST_HEAD(&q->droplist); q->root.cl_common.classid = sch->handle; q->root.refcnt = 1; @@ -1527,7 +1519,6 @@ hfsc_reset_qdisc(struct Qdisc *sch) hfsc_reset_class(cl); } q->eligible = RB_ROOT; - INIT_LIST_HEAD(&q->droplist); qdisc_watchdog_cancel(&q->watchdog); sch->qstats.backlog = 0; sch->q.qlen = 0; @@ -1572,7 +1563,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) } static int -hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct hfsc_class *cl; int uninitialized_var(err); @@ -1581,11 +1572,11 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } - err = qdisc_enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; @@ -1594,8 +1585,17 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) return err; } - if (cl->qdisc->q.qlen == 1) + if (cl->qdisc->q.qlen == 1) { set_active(cl, qdisc_pkt_len(skb)); + /* + * If this is the first packet, isolate the head so an eventual + * head drop before the first dequeue operation has no chance + * to invalidate the deadline. + */ + if (cl->cl_flags & HFSC_RSC) + cl->qdisc->ops->peek(cl->qdisc); + + } qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -1664,7 +1664,6 @@ hfsc_dequeue(struct Qdisc *sch) set_passive(cl); } - qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; @@ -1672,32 +1671,6 @@ hfsc_dequeue(struct Qdisc *sch) return skb; } -static unsigned int -hfsc_drop(struct Qdisc *sch) -{ - struct hfsc_sched *q = qdisc_priv(sch); - struct hfsc_class *cl; - unsigned int len; - - list_for_each_entry(cl, &q->droplist, dlist) { - if (cl->qdisc->ops->drop != NULL && - (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) { - if (cl->qdisc->q.qlen == 0) { - update_vf(cl, 0, 0); - set_passive(cl); - } else { - list_move_tail(&cl->dlist, &q->droplist); - } - cl->qstats.drops++; - qdisc_qstats_drop(sch); - sch->qstats.backlog -= len; - sch->q.qlen--; - return len; - } - } - return 0; -} - static const struct Qdisc_class_ops hfsc_class_ops = { .change = hfsc_change_class, .delete = hfsc_delete_class, @@ -1724,7 +1697,6 @@ static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { .enqueue = hfsc_enqueue, .dequeue = hfsc_dequeue, .peek = qdisc_peek_dequeued, - .drop = hfsc_drop, .cl_ops = &hfsc_class_ops, .priv_size = sizeof(struct hfsc_sched), .owner = THIS_MODULE diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 13d6f83ec..e3d0458af 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -345,7 +345,7 @@ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) skb->next = NULL; } -static unsigned int hhf_drop(struct Qdisc *sch) +static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); struct wdrr_bucket *bucket; @@ -359,25 +359,16 @@ static unsigned int hhf_drop(struct Qdisc *sch) struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; - qdisc_qstats_drop(sch); qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); + qdisc_drop(skb, sch, to_free); } /* Return id of the bucket from which the packet was dropped. */ return bucket - q->buckets; } -static unsigned int hhf_qdisc_drop(struct Qdisc *sch) -{ - unsigned int prev_backlog; - - prev_backlog = sch->qstats.backlog; - hhf_drop(sch); - return prev_backlog - sch->qstats.backlog; -} - -static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; @@ -415,7 +406,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Return Congestion Notification only if we dropped a packet from this * bucket. */ - if (hhf_drop(sch) == idx) + if (hhf_drop(sch, to_free) == idx) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ @@ -473,7 +464,7 @@ static void hhf_reset(struct Qdisc *sch) struct sk_buff *skb; while ((skb = hhf_dequeue(sch)) != NULL) - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } static void *hhf_zalloc(size_t sz) @@ -583,7 +574,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = hhf_dequeue(sch); - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, prev_backlog - sch->qstats.backlog); @@ -709,7 +700,6 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, - .drop = hhf_qdisc_drop, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 052f84d6c..53dbfa187 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -117,7 +117,6 @@ struct htb_class { * Written often fields */ struct gnet_stats_basic_packed bstats; - struct gnet_stats_queue qstats; struct tc_htb_xstats xstats; /* our special stats */ /* token bucket parameters */ @@ -140,6 +139,8 @@ struct htb_class { enum htb_cmode cmode; /* current mode of the class */ struct rb_node pq_node; /* node for event queue */ struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ + + unsigned int drops ____cacheline_aligned_in_smp; }; struct htb_level { @@ -569,7 +570,8 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) list_del_init(&cl->un.leaf.drop_list); } -static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { int uninitialized_var(ret); struct htb_sched *q = qdisc_priv(sch); @@ -581,19 +583,20 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) __skb_queue_tail(&q->direct_queue, skb); q->direct_pkts++; } else { - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; #endif - } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) { + } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q, + to_free)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { qdisc_qstats_drop(sch); - cl->qstats.drops++; + cl->drops++; } return ret; } else { @@ -889,7 +892,6 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) if (skb != NULL) { ok: qdisc_bstats_update(sch, skb); - qdisc_unthrottled(sch); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; @@ -929,38 +931,13 @@ ok: } qdisc_qstats_overlimit(sch); if (likely(next_event > q->now)) - qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true); + qdisc_watchdog_schedule_ns(&q->watchdog, next_event); else schedule_work(&q->work); fin: return skb; } -/* try to drop from each class (by prio) until one succeed */ -static unsigned int htb_drop(struct Qdisc *sch) -{ - struct htb_sched *q = qdisc_priv(sch); - int prio; - - for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { - struct list_head *p; - list_for_each(p, q->drops + prio) { - struct htb_class *cl = list_entry(p, struct htb_class, - un.leaf.drop_list); - unsigned int len; - if (cl->un.leaf.q->ops->drop && - (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { - sch->qstats.backlog -= len; - sch->q.qlen--; - if (!cl->un.leaf.q->q.qlen) - htb_deactivate(q, cl); - return len; - } - } - } - return 0; -} - /* reset all classes */ /* always caled under BH & queue lock */ static void htb_reset(struct Qdisc *sch) @@ -983,7 +960,7 @@ static void htb_reset(struct Qdisc *sch) } } qdisc_watchdog_cancel(&q->watchdog); - __skb_queue_purge(&q->direct_queue); + __qdisc_reset_queue(&q->direct_queue); sch->q.qlen = 0; sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); @@ -1136,18 +1113,24 @@ static int htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct htb_class *cl = (struct htb_class *)arg; + struct gnet_stats_queue qs = { + .drops = cl->drops, + }; __u32 qlen = 0; - if (!cl->level && cl->un.leaf.q) + if (!cl->level && cl->un.leaf.q) { qlen = cl->un.leaf.q->q.qlen; + qs.backlog = cl->un.leaf.q->qstats.backlog; + } cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens), INT_MIN, INT_MAX); cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens), INT_MIN, INT_MAX); - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) + gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); @@ -1260,7 +1243,7 @@ static void htb_destroy(struct Qdisc *sch) htb_destroy_class(sch, cl); } qdisc_class_hash_destroy(&q->clhash); - __skb_queue_purge(&q->direct_queue); + __qdisc_reset_queue(&q->direct_queue); } static int htb_delete(struct Qdisc *sch, unsigned long arg) @@ -1399,7 +1382,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (htb_rate_est || tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE] ? : &est.nla); if (err) { kfree(cl); @@ -1461,11 +1445,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent->children++; } else { if (tca[TCA_RATE]) { - spinlock_t *lock = qdisc_root_sleeping_lock(sch); - err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - lock, + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -1603,7 +1586,6 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .enqueue = htb_enqueue, .dequeue = htb_dequeue, .peek = qdisc_peek_dequeued, - .drop = htb_drop, .init = htb_init, .reset = htb_reset, .destroy = htb_destroy, diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 56a77b878..b9439827c 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -199,7 +199,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b8002ce3d..549c66359 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -342,7 +342,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, * hold here is the look on dev_queue->qdisc_sleeping * also acquired below. */ - spin_unlock_bh(d->lock); + if (d->lock) + spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); @@ -359,15 +360,17 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, spin_unlock_bh(qdisc_lock(qdisc)); } /* Reclaim root sleeping lock before completing stats */ - spin_lock_bh(d->lock); - if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || + if (d->lock) + spin_lock_bh(d->lock); + if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 || gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index bcdd54bb1..9ffbb025b 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -65,7 +65,8 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } static int -multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; @@ -76,12 +77,12 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } #endif - ret = qdisc_enqueue(skb, qdisc); + ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -151,27 +152,6 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch) } -static unsigned int multiq_drop(struct Qdisc *sch) -{ - struct multiq_sched_data *q = qdisc_priv(sch); - int band; - unsigned int len; - struct Qdisc *qdisc; - - for (band = q->bands - 1; band >= 0; band--) { - qdisc = q->queues[band]; - if (qdisc->ops->drop) { - len = qdisc->ops->drop(qdisc); - if (len != 0) { - sch->q.qlen--; - return len; - } - } - } - return 0; -} - - static void multiq_reset(struct Qdisc *sch) { @@ -356,7 +336,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; @@ -415,7 +396,6 @@ static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { .enqueue = multiq_enqueue, .dequeue = multiq_dequeue, .peek = multiq_peek, - .drop = multiq_drop, .init = multiq_init, .reset = multiq_reset, .destroy = multiq_destroy, diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 178f1630a..aaaf02175 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -368,9 +368,7 @@ static void tfifo_reset(struct Qdisc *sch) struct sk_buff *skb = netem_rb_to_skb(p); rb_erase(p, &q->t_root); - skb->next = NULL; - skb->prev = NULL; - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } } @@ -399,7 +397,8 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) * when we statistically choose to corrupt one, we instead segment it, returning * the first packet to be corrupted, and re-enqueue the remaining frames */ -static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) +static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); @@ -407,7 +406,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { - qdisc_reshape_fail(skb, sch); + qdisc_drop(skb, sch, to_free); return NULL; } consume_skb(skb); @@ -420,7 +419,8 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) * NET_XMIT_DROP: queue length didn't change. * NET_XMIT_SUCCESS: one skb was queued. */ -static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct netem_sched_data *q = qdisc_priv(sch); /* We don't fill cb now as skb_unshare() may invalidate it */ @@ -445,7 +445,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (count == 0) { qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -465,7 +465,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; - rootq->enqueue(skb2, rootq); + rootq->enqueue(skb2, rootq, to_free); q->duplicate = dupsave; } @@ -477,7 +477,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { if (skb_is_gso(skb)) { - segs = netem_segment(skb, sch); + segs = netem_segment(skb, sch, to_free); if (!segs) return NET_XMIT_DROP; } else { @@ -487,10 +487,14 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) skb = segs; segs = segs->next; - if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || - (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) { - rc = qdisc_drop(skb, sch); + skb = skb_unshare(skb, GFP_ATOMIC); + if (unlikely(!skb)) { + qdisc_qstats_drop(sch); + goto finish_segs; + } + if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(skb)) { + qdisc_drop(skb, sch, to_free); goto finish_segs; } @@ -499,7 +503,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch, to_free); qdisc_qstats_backlog_inc(sch, skb); @@ -559,7 +563,7 @@ finish_segs: segs->next = NULL; qdisc_skb_cb(segs)->pkt_len = segs->len; last_len = segs->len; - rc = qdisc_enqueue(segs, sch); + rc = qdisc_enqueue(segs, sch, to_free); if (rc != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(rc)) qdisc_qstats_drop(sch); @@ -576,50 +580,17 @@ finish_segs: return NET_XMIT_SUCCESS; } -static unsigned int netem_drop(struct Qdisc *sch) -{ - struct netem_sched_data *q = qdisc_priv(sch); - unsigned int len; - - len = qdisc_queue_drop(sch); - - if (!len) { - struct rb_node *p = rb_first(&q->t_root); - - if (p) { - struct sk_buff *skb = netem_rb_to_skb(p); - - rb_erase(p, &q->t_root); - sch->q.qlen--; - skb->next = NULL; - skb->prev = NULL; - qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); - } - } - if (!len && q->qdisc && q->qdisc->ops->drop) - len = q->qdisc->ops->drop(q->qdisc); - if (len) - qdisc_qstats_drop(sch); - - return len; -} - static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; struct rb_node *p; - if (qdisc_is_throttled(sch)) - return NULL; - tfifo_dequeue: skb = __skb_dequeue(&sch->q); if (skb) { qdisc_qstats_backlog_dec(sch, skb); deliver: - qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; } @@ -651,8 +622,11 @@ deliver: if (q->qdisc) { unsigned int pkt_len = qdisc_pkt_len(skb); - int err = qdisc_enqueue(skb, q->qdisc); + struct sk_buff *to_free = NULL; + int err; + err = qdisc_enqueue(skb, q->qdisc, &to_free); + kfree_skb_list(to_free); if (err != NET_XMIT_SUCCESS && net_xmit_drop_count(err)) { qdisc_qstats_drop(sch); @@ -1143,7 +1117,6 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .enqueue = netem_enqueue, .dequeue = netem_dequeue, .peek = qdisc_peek_dequeued, - .drop = netem_drop, .init = netem_init, .reset = netem_reset, .destroy = netem_destroy, diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 71ae3b962..a570b0bb2 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -134,7 +134,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size) return false; } -static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; @@ -166,7 +167,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) out: q->stats.dropped++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { @@ -234,7 +235,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index 5abfe4467..1c6cbab3e 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -64,6 +64,8 @@ struct plug_sched_data { */ bool unplug_indefinite; + bool throttled; + /* Queue Limit in bytes */ u32 limit; @@ -86,7 +88,8 @@ struct plug_sched_data { u32 pkts_to_release; }; -static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct plug_sched_data *q = qdisc_priv(sch); @@ -96,14 +99,14 @@ static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) return qdisc_enqueue_tail(skb, sch); } - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch, to_free); } static struct sk_buff *plug_dequeue(struct Qdisc *sch) { struct plug_sched_data *q = qdisc_priv(sch); - if (qdisc_is_throttled(sch)) + if (q->throttled) return NULL; if (!q->unplug_indefinite) { @@ -111,7 +114,7 @@ static struct sk_buff *plug_dequeue(struct Qdisc *sch) /* No more packets to dequeue. Block the queue * and wait for the next release command. */ - qdisc_throttled(sch); + q->throttled = true; return NULL; } q->pkts_to_release--; @@ -141,7 +144,7 @@ static int plug_init(struct Qdisc *sch, struct nlattr *opt) q->limit = ctl->limit; } - qdisc_throttled(sch); + q->throttled = true; return 0; } @@ -173,7 +176,7 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt) q->pkts_last_epoch = q->pkts_current_epoch; q->pkts_current_epoch = 0; if (q->unplug_indefinite) - qdisc_throttled(sch); + q->throttled = true; q->unplug_indefinite = false; break; case TCQ_PLUG_RELEASE_ONE: @@ -182,7 +185,7 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt) */ q->pkts_to_release += q->pkts_last_epoch; q->pkts_last_epoch = 0; - qdisc_unthrottled(sch); + q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_RELEASE_INDEFINITE: @@ -190,7 +193,7 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt) q->pkts_to_release = 0; q->pkts_last_epoch = 0; q->pkts_current_epoch = 0; - qdisc_unthrottled(sch); + q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_LIMIT: diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index a356450b7..8f575899a 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -67,7 +67,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } static int -prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) +prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; @@ -83,7 +83,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) } #endif - ret = qdisc_enqueue(skb, qdisc); + ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -127,25 +127,6 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch) } -static unsigned int prio_drop(struct Qdisc *sch) -{ - struct prio_sched_data *q = qdisc_priv(sch); - int prio; - unsigned int len; - struct Qdisc *qdisc; - - for (prio = q->bands-1; prio >= 0; prio--) { - qdisc = q->queues[prio]; - if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) { - sch->qstats.backlog -= len; - sch->q.qlen--; - return len; - } - } - return 0; -} - - static void prio_reset(struct Qdisc *sch) { @@ -304,7 +285,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; @@ -363,7 +345,6 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .enqueue = prio_enqueue, .dequeue = prio_dequeue, .peek = prio_peek, - .drop = prio_drop, .init = prio_init, .reset = prio_reset, .destroy = prio_destroy, diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index f18857feb..ca0516e6f 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -460,7 +460,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -486,7 +487,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) goto destroy_class; @@ -663,7 +665,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) @@ -1150,6 +1153,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch) if (!skb) return NULL; + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; qdisc_bstats_update(sch, skb); @@ -1214,7 +1218,8 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) return agg; } -static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; @@ -1237,11 +1242,11 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) qdisc_pkt_len(skb)); if (err) { cl->qstats.drops++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } } - err = qdisc_enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); if (net_xmit_drop_count(err)) { @@ -1252,6 +1257,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) } bstats_update(&cl->bstats, skb); + qdisc_qstats_backlog_inc(sch, skb); ++sch->q.qlen; agg = cl->agg; @@ -1422,52 +1428,6 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) qfq_deactivate_class(q, cl); } -static unsigned int qfq_drop_from_slot(struct qfq_sched *q, - struct hlist_head *slot) -{ - struct qfq_aggregate *agg; - struct qfq_class *cl; - unsigned int len; - - hlist_for_each_entry(agg, slot, next) { - list_for_each_entry(cl, &agg->active, alist) { - - if (!cl->qdisc->ops->drop) - continue; - - len = cl->qdisc->ops->drop(cl->qdisc); - if (len > 0) { - if (cl->qdisc->q.qlen == 0) - qfq_deactivate_class(q, cl); - - return len; - } - } - } - return 0; -} - -static unsigned int qfq_drop(struct Qdisc *sch) -{ - struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; - unsigned int i, j, len; - - for (i = 0; i <= QFQ_MAX_INDEX; i++) { - grp = &q->groups[i]; - for (j = 0; j < QFQ_MAX_SLOTS; j++) { - len = qfq_drop_from_slot(q, &grp->slots[j]); - if (len > 0) { - sch->q.qlen--; - return len; - } - } - - } - - return 0; -} - static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct qfq_sched *q = qdisc_priv(sch); @@ -1518,6 +1478,7 @@ static void qfq_reset_qdisc(struct Qdisc *sch) qdisc_reset(cl->qdisc); } } + sch->qstats.backlog = 0; sch->q.qlen = 0; } @@ -1562,7 +1523,6 @@ static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { .enqueue = qfq_enqueue, .dequeue = qfq_dequeue, .peek = qdisc_peek_dequeued, - .drop = qfq_drop, .init = qfq_init_qdisc, .reset = qfq_reset_qdisc, .destroy = qfq_destroy_qdisc, diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 91578bdd3..249b2a18a 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -56,7 +56,8 @@ static inline int red_use_harddrop(struct red_sched_data *q) return q->flags & TC_RED_HARDDROP; } -static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; @@ -95,7 +96,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; } - ret = qdisc_enqueue(skb, child); + ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -106,7 +107,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; congestion_drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } @@ -136,26 +137,6 @@ static struct sk_buff *red_peek(struct Qdisc *sch) return child->ops->peek(child); } -static unsigned int red_drop(struct Qdisc *sch) -{ - struct red_sched_data *q = qdisc_priv(sch); - struct Qdisc *child = q->qdisc; - unsigned int len; - - if (child->ops->drop && (len = child->ops->drop(child)) > 0) { - q->stats.other++; - qdisc_qstats_drop(sch); - sch->qstats.backlog -= len; - sch->q.qlen--; - return len; - } - - if (!red_is_idling(&q->vars)) - red_start_of_idle_period(&q->vars); - - return 0; -} - static void red_reset(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); @@ -365,7 +346,6 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = { .enqueue = red_enqueue, .dequeue = red_dequeue, .peek = red_peek, - .drop = red_drop, .init = red_init, .reset = red_reset, .destroy = red_destroy, diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index c69611640..20a350bd1 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -275,7 +275,8 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, return false; } -static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct sfb_sched_data *q = qdisc_priv(sch); @@ -397,8 +398,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } enqueue: - ret = qdisc_enqueue(skb, child); + ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; increment_qlen(skb, q); } else if (net_xmit_drop_count(ret)) { @@ -408,7 +410,7 @@ enqueue: return ret; drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) @@ -427,6 +429,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch) if (skb) { qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; decrement_qlen(skb, q); } @@ -449,6 +452,7 @@ static void sfb_reset(struct Qdisc *sch) struct sfb_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); + sch->qstats.backlog = 0; sch->q.qlen = 0; q->slot = 0; q->double_buffering = false; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 498f0a2cb..7f195ed4d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -343,7 +343,7 @@ static int sfq_headdrop(const struct sfq_sched_data *q) } static int -sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int hash, dropped; @@ -367,7 +367,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; @@ -424,14 +424,14 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (slot->qlen >= q->maxdepth) { congestion_drop: if (!sfq_headdrop(q)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); /* We know we have at least one packet in queue */ head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; slot->backlog -= delta; - qdisc_drop(head, sch); + qdisc_drop(head, sch, to_free); slot_queue_add(slot, skb); return NET_XMIT_CN; @@ -520,7 +520,7 @@ sfq_reset(struct Qdisc *sch) struct sk_buff *skb; while ((skb = sfq_dequeue(sch)) != NULL) - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } /* @@ -896,7 +896,6 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { .enqueue = sfq_enqueue, .dequeue = sfq_dequeue, .peek = qdisc_peek_dequeued, - .drop = sfq_drop, .init = sfq_init, .reset = sfq_reset, .destroy = sfq_destroy, diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 3161e4919..303355c44 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -155,7 +155,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ -static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; @@ -166,7 +167,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch, to_free); nb = 0; while (segs) { @@ -174,7 +175,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) segs->next = NULL; qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; - ret = qdisc_enqueue(segs, q->qdisc); + ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); @@ -190,17 +191,18 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } -static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; if (qdisc_pkt_len(skb) > q->max_size) { if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) - return tbf_segment(skb, sch); - return qdisc_reshape_fail(skb, sch); + return tbf_segment(skb, sch, to_free); + return qdisc_drop(skb, sch, to_free); } - ret = qdisc_enqueue(skb, q->qdisc); + ret = qdisc_enqueue(skb, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); @@ -212,19 +214,6 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } -static unsigned int tbf_drop(struct Qdisc *sch) -{ - struct tbf_sched_data *q = qdisc_priv(sch); - unsigned int len = 0; - - if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { - sch->qstats.backlog -= len; - sch->q.qlen--; - qdisc_qstats_drop(sch); - } - return len; -} - static bool tbf_peak_present(const struct tbf_sched_data *q) { return q->peak.rate_bytes_ps; @@ -267,14 +256,12 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) q->ptokens = ptoks; qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; - qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; } qdisc_watchdog_schedule_ns(&q->watchdog, - now + max_t(long, -toks, -ptoks), - true); + now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, @@ -559,7 +546,6 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .enqueue = tbf_enqueue, .dequeue = tbf_dequeue, .peek = qdisc_peek_dequeued, - .drop = tbf_drop, .init = tbf_init, .reset = tbf_reset, .destroy = tbf_destroy, diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index e02687185..2cd9b4478 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -77,7 +77,7 @@ struct teql_sched_data { /* "teql*" qdisc routines */ static int -teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) +teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); @@ -87,7 +87,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static struct sk_buff * diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 0fca5824a..6c4f7496c 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -11,7 +11,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ transport.o chunk.o sm_make_chunk.o ulpevent.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ - output.o input.o debug.o ssnmap.o auth.o + output.o input.o debug.o ssnmap.o auth.o \ + offload.o sctp_probe-y := probe.o diff --git a/net/sctp/associola.c b/net/sctp/associola.c index e1849f371..1c23060c4 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -268,6 +268,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a goto fail_init; asoc->active_key_id = ep->active_key_id; + asoc->prsctp_enable = ep->prsctp_enable; /* Save the hmacs and chunks list into this association */ if (ep->auth_hmacs_list) diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 1eb94bf18..0a3dbec0a 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -179,6 +179,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, msg, msg->expires_at, jiffies); } + if (asoc->peer.prsctp_capable && + SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) + msg->expires_at = + jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); + /* This is the biggest possible DATA chunk that can fit into * the packet */ @@ -335,13 +340,32 @@ errout: /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { - struct sctp_datamsg *msg = chunk->msg; + if (!chunk->asoc->peer.prsctp_capable || + !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) { + struct sctp_datamsg *msg = chunk->msg; + + if (!msg->can_abandon) + return 0; + + if (time_after(jiffies, msg->expires_at)) + return 1; - if (!msg->can_abandon) return 0; + } - if (time_after(jiffies, msg->expires_at)) + if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && + time_after(jiffies, chunk->msg->expires_at)) { + if (chunk->sent_count) + chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + else + chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + return 1; + } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && + chunk->sent_count > chunk->sinfo.sinfo_timetolive) { + chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; + } + /* PRIO policy is processed by sendmsg, not here */ return 0; } diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 9d494e35e..1f0306568 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -163,6 +163,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, */ ep->auth_hmacs_list = auth_hmacs; ep->auth_chunk_list = auth_chunks; + ep->prsctp_enable = net->sctp.prsctp_enable; return ep; diff --git a/net/sctp/input.c b/net/sctp/input.c index f09332256..1555fb8c6 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -90,17 +90,6 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) return 0; } -struct sctp_input_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; - struct sctp_chunk *chunk; -}; -#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) - /* * This is the routine which IP calls when receiving an SCTP packet. */ @@ -123,31 +112,41 @@ int sctp_rcv(struct sk_buff *skb) __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); - if (skb_linearize(skb)) + /* If packet is too small to contain a single chunk, let's not + * waste time on it anymore. + */ + if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + + skb_transport_offset(skb)) goto discard_it; - /* Pull up the IP and SCTP headers. */ - __skb_pull(skb, skb_transport_offset(skb)); - if (skb->len < sizeof(struct sctphdr)) + /* If the packet is fragmented and we need to do crc checking, + * it's better to just linearize it otherwise crc computing + * takes longer. + */ + if ((!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) && + skb_linearize(skb)) || + !pskb_may_pull(skb, sizeof(struct sctphdr))) goto discard_it; + /* Pull up the IP header. */ + __skb_pull(skb, skb_transport_offset(skb)); + skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); - else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0) + else if (!sctp_checksum_disable && + !(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) && + sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; - skb_pull(skb, sizeof(struct sctphdr)); - - /* Make sure we at least have chunk headers worth of data left. */ - if (skb->len < sizeof(struct sctp_chunkhdr)) - goto discard_it; + __skb_pull(skb, sizeof(struct sctphdr)); family = ipver2af(ip_hdr(skb)->version); af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; + SCTP_INPUT_CB(skb)->af = af; /* Initialize local addresses for lookups. */ af->from_skb(&src, skb, 1); @@ -659,19 +658,23 @@ out_unlock: */ static int sctp_rcv_ootb(struct sk_buff *skb) { - sctp_chunkhdr_t *ch; - __u8 *ch_end; - - ch = (sctp_chunkhdr_t *) skb->data; + sctp_chunkhdr_t *ch, _ch; + int ch_end, offset = 0; /* Scan through all the chunks in the packet. */ do { + /* Make sure we have at least the header there */ + if (offset + sizeof(sctp_chunkhdr_t) > skb->len) + break; + + ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); + /* Break out if chunk length is less then minimal. */ if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); - if (ch_end > skb_tail_pointer(skb)) + ch_end = offset + WORD_ROUND(ntohs(ch->length)); + if (ch_end > skb->len) break; /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the @@ -696,8 +699,8 @@ static int sctp_rcv_ootb(struct sk_buff *skb) if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data) goto discard; - ch = (sctp_chunkhdr_t *) ch_end; - } while (ch_end < skb_tail_pointer(skb)); + offset = ch_end; + } while (ch_end < skb->len); return 0; @@ -793,27 +796,34 @@ struct sctp_hash_cmp_arg { static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { + struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; - const struct sctp_transport *t = ptr; - struct sctp_association *asoc = t->asoc; - const struct net *net = x->net; + struct sctp_association *asoc; + int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) - return 1; - if (!net_eq(sock_net(asoc->base.sk), net)) - return 1; + return err; + if (!sctp_transport_hold(t)) + return err; + + asoc = t->asoc; + if (!net_eq(sock_net(asoc->base.sk), x->net)) + goto out; if (x->ep) { if (x->ep != asoc->ep) - return 1; + goto out; } else { if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) - return 1; + goto out; if (!sctp_bind_addr_match(&asoc->base.bind_addr, x->laddr, sctp_sk(asoc->base.sk))) - return 1; + goto out; } - return 0; + err = 0; +out: + sctp_transport_put(t); + return err; } static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed) @@ -1172,6 +1182,14 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, { sctp_chunkhdr_t *ch; + /* We do not allow GSO frames here as we need to linearize and + * then cannot guarantee frame boundaries. This shouldn't be an + * issue as packets hitting this are mostly INIT or INIT-ACK and + * those cannot be on GSO-style anyway. + */ + if ((skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) + return NULL; + ch = (sctp_chunkhdr_t *) skb->data; /* The code below will attempt to walk the chunk and extract diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index b335ffcef..6437aa97c 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -128,13 +128,25 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) * at this time. */ - if ((chunk = queue->in_progress)) { + chunk = queue->in_progress; + if (chunk) { /* There is a packet that we have been working on. * Any post processing work to do before we move on? */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) { + if (chunk->head_skb == chunk->skb) { + chunk->skb = skb_shinfo(chunk->skb)->frag_list; + goto new_skb; + } + if (chunk->skb->next) { + chunk->skb = chunk->skb->next; + goto new_skb; + } + + if (chunk->head_skb) + chunk->skb = chunk->head_skb; sctp_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { @@ -150,34 +162,58 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) if (!chunk) { struct list_head *entry; +next_chunk: /* Is the queue empty? */ - if (list_empty(&queue->in_chunk_list)) + entry = sctp_list_dequeue(&queue->in_chunk_list); + if (!entry) return NULL; - entry = queue->in_chunk_list.next; - chunk = queue->in_progress = - list_entry(entry, struct sctp_chunk, list); - list_del_init(entry); + chunk = list_entry(entry, struct sctp_chunk, list); - /* This is the first chunk in the packet. */ - chunk->singleton = 1; - ch = (sctp_chunkhdr_t *) chunk->skb->data; - chunk->data_accepted = 0; + if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) { + /* GSO-marked skbs but without frags, handle + * them normally + */ + if (skb_shinfo(chunk->skb)->frag_list) + chunk->head_skb = chunk->skb; + + /* skbs with "cover letter" */ + if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) + chunk->skb = skb_shinfo(chunk->skb)->frag_list; + + if (WARN_ON(!chunk->skb)) { + __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); + sctp_chunk_free(chunk); + goto next_chunk; + } + } if (chunk->asoc) sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); + + queue->in_progress = chunk; + +new_skb: + /* This is the first chunk in the packet. */ + ch = (sctp_chunkhdr_t *) chunk->skb->data; + chunk->singleton = 1; + chunk->data_accepted = 0; + chunk->pdiscard = 0; + chunk->auth = 0; + chunk->has_asconf = 0; + chunk->end_of_packet = 0; + if (chunk->head_skb) { + struct sctp_input_cb + *cb = SCTP_INPUT_CB(chunk->skb), + *head_cb = SCTP_INPUT_CB(chunk->head_skb); + + cb->chunk = head_cb->chunk; + cb->af = head_cb->af; + } } chunk->chunk_hdr = ch; chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); - /* In the unlikely case of an IP reassembly, the skb could be - * non-linear. If so, update chunk_end so that it doesn't go past - * the skb->tail. - */ - if (unlikely(skb_is_nonlinear(chunk->skb))) { - if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) - chunk->chunk_end = skb_tail_pointer(chunk->skb); - } skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0657d18a8..f473779e8 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -420,6 +420,7 @@ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; + /* Always called on head skb, so this is safe */ sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; @@ -559,6 +560,7 @@ static int sctp_v6_is_any(const union sctp_addr *addr) static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) { int type; + struct net *net = sock_net(&sp->inet.sk); const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr; type = ipv6_addr_type(in6); @@ -573,7 +575,8 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) if (!(type & IPV6_ADDR_UNICAST)) return 0; - return ipv6_chk_addr(sock_net(&sp->inet.sk), in6, NULL, 0); + return sp->inet.freebind || net->ipv6.sysctl.ip_nonlocal_bind || + ipv6_chk_addr(net, in6, NULL, 0); } /* This function checks if the address is a valid address to be used for @@ -710,8 +713,7 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { - struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb; - return opt->iif; + return IP6CB(skb)->iif; } /* Was this packet marked by Explicit Congestion Notification? */ @@ -780,15 +782,14 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, if (ip_hdr(skb)->version == 4) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = sh->source; - addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; + addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) { - struct sctp_ulpevent *ev = sctp_skb2event(skb); - addr->v6.sin6_scope_id = ev->iif; + addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); } } @@ -955,7 +956,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, + .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, diff --git a/net/sctp/offload.c b/net/sctp/offload.c new file mode 100644 index 000000000..7e869d0cc --- /dev/null +++ b/net/sctp/offload.c @@ -0,0 +1,119 @@ +/* + * sctp_offload - GRO/GSO Offloading for SCTP + * + * Copyright (C) 2015, Marcelo Ricardo Leitner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static __le32 sctp_gso_make_checksum(struct sk_buff *skb) +{ + skb->ip_summed = CHECKSUM_NONE; + return sctp_compute_cksum(skb, skb_transport_offset(skb)); +} + +static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct sctphdr *sh; + + sh = sctp_hdr(skb); + if (!pskb_may_pull(skb, sizeof(*sh))) + goto out; + + __skb_pull(skb, sizeof(*sh)); + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + struct skb_shared_info *pinfo = skb_shinfo(skb); + struct sk_buff *frag_iter; + + pinfo->gso_segs = 0; + if (skb->len != skb->data_len) { + /* Means we have chunks in here too */ + pinfo->gso_segs++; + } + + skb_walk_frags(skb, frag_iter) + pinfo->gso_segs++; + + segs = NULL; + goto out; + } + + segs = skb_segment(skb, features | NETIF_F_HW_CSUM); + if (IS_ERR(segs)) + goto out; + + /* All that is left is update SCTP CRC if necessary */ + if (!(features & NETIF_F_SCTP_CRC)) { + for (skb = segs; skb; skb = skb->next) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + sh = sctp_hdr(skb); + sh->checksum = sctp_gso_make_checksum(skb); + } + } + } + +out: + return segs; +} + +static const struct net_offload sctp_offload = { + .callbacks = { + .gso_segment = sctp_gso_segment, + }, +}; + +static const struct net_offload sctp6_offload = { + .callbacks = { + .gso_segment = sctp_gso_segment, + }, +}; + +int __init sctp_offload_init(void) +{ + int ret; + + ret = inet_add_offload(&sctp_offload, IPPROTO_SCTP); + if (ret) + goto out; + + ret = inet6_add_offload(&sctp6_offload, IPPROTO_SCTP); + if (ret) + goto ipv4; + + return ret; + +ipv4: + inet_del_offload(&sctp_offload, IPPROTO_SCTP); +out: + return ret; +} diff --git a/net/sctp/output.c b/net/sctp/output.c index 9844fe573..31b7bc358 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -84,18 +84,42 @@ static void sctp_packet_reset(struct sctp_packet *packet) struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { - struct sctp_chunk *chunk = NULL; + struct sctp_transport *tp = packet->transport; + struct sctp_association *asoc = tp->asoc; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; + if (asoc && tp->dst) { + struct sock *sk = asoc->base.sk; + + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + + if (sk_can_gso(sk)) { + struct net_device *dev = tp->dst->dev; + + packet->max_size = dev->gso_max_size; + } else { + packet->max_size = asoc->pathmtu; + } + rcu_read_unlock(); + + } else { + packet->max_size = tp->pathmtu; + } + if (ecn_capable && sctp_packet_empty(packet)) { - chunk = sctp_get_ecne_prepend(packet->transport->asoc); + struct sctp_chunk *chunk; /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ + chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } @@ -158,7 +182,8 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, sctp_xmit_t retval; int error = 0; - pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk); + pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__, + packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: @@ -291,6 +316,8 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, packet->has_data = 1; /* timestamp the chunk for rtx purposes */ chunk->sent_at = jiffies; + /* Mainly used for prsctp RTX policy */ + chunk->sent_count++; break; case SCTP_CID_COOKIE_ECHO: packet->has_cookie_echo = 1; @@ -381,12 +408,15 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctphdr *sh; - struct sk_buff *nskb; + struct sk_buff *nskb = NULL, *head = NULL; struct sctp_chunk *chunk, *tmp; struct sock *sk; int err = 0; int padding; /* How much padding do we need? */ + int pkt_size; __u8 has_data = 0; + int gso = 0; + int pktcount = 0; struct dst_entry *dst; unsigned char *auth = NULL; /* pointer to auth in skb data */ @@ -400,18 +430,37 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; - /* Allocate the new skb. */ - nskb = alloc_skb(packet->size + MAX_HEADER, gfp); - if (!nskb) + /* Allocate the head skb, or main one if not in GSO */ + if (packet->size > tp->pathmtu && !packet->ipfragok) { + if (sk_can_gso(sk)) { + gso = 1; + pkt_size = packet->overhead; + } else { + /* If this happens, we trash this packet and try + * to build a new one, hopefully correct this + * time. Application may notice this error. + */ + pr_err_once("Trying to GSO but underlying device doesn't support it."); + goto nomem; + } + } else { + pkt_size = packet->size; + } + head = alloc_skb(pkt_size + MAX_HEADER, gfp); + if (!head) goto nomem; + if (gso) { + NAPI_GRO_CB(head)->last = head; + skb_shinfo(head)->gso_type = sk->sk_gso_type; + } /* Make sure the outbound skb has enough header room reserved. */ - skb_reserve(nskb, packet->overhead + MAX_HEADER); + skb_reserve(head, packet->overhead + MAX_HEADER); /* Set the owning socket so that we know where to get the * destination IP address. */ - sctp_packet_set_owner_w(nskb, sk); + sctp_packet_set_owner_w(head, sk); if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sctp_sk(sk)); @@ -422,11 +471,11 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) dst = dst_clone(tp->dst); if (!dst) goto no_route; - skb_dst_set(nskb, dst); + skb_dst_set(head, dst); /* Build the SCTP header. */ - sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); - skb_reset_transport_header(nskb); + sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr)); + skb_reset_transport_header(head); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); @@ -441,90 +490,144 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sh->vtag = htonl(packet->vtag); sh->checksum = 0; - /** - * 6.10 Bundling - * - * An endpoint bundles chunks by simply including multiple - * chunks in one outbound SCTP packet. ... - */ - - /** - * 3.2 Chunk Field Descriptions - * - * The total length of a chunk (including Type, Length and - * Value fields) MUST be a multiple of 4 bytes. If the length - * of the chunk is not a multiple of 4 bytes, the sender MUST - * pad the chunk with all zero bytes and this padding is not - * included in the chunk length field. The sender should - * never pad with more than 3 bytes. - * - * [This whole comment explains WORD_ROUND() below.] - */ - pr_debug("***sctp_transmit_packet***\n"); - list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { - list_del_init(&chunk->list); - if (sctp_chunk_is_data(chunk)) { - /* 6.3.1 C4) When data is in flight and when allowed - * by rule C5, a new RTT measurement MUST be made each - * round trip. Furthermore, new RTT measurements - * SHOULD be made no more than once per round-trip - * for a given destination transport address. - */ + do { + /* Set up convenience variables... */ + chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); + pktcount++; - if (!chunk->resent && !tp->rto_pending) { - chunk->rtt_in_progress = 1; - tp->rto_pending = 1; + /* Calculate packet size, so it fits in PMTU. Leave + * other chunks for the next packets. + */ + if (gso) { + pkt_size = packet->overhead; + list_for_each_entry(chunk, &packet->chunk_list, list) { + int padded = WORD_ROUND(chunk->skb->len); + + if (pkt_size + padded > tp->pathmtu) + break; + pkt_size += padded; } - has_data = 1; - } + /* Allocate a new skb. */ + nskb = alloc_skb(pkt_size + MAX_HEADER, gfp); + if (!nskb) + goto nomem; - padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; - if (padding) - memset(skb_put(chunk->skb, padding), 0, padding); + /* Make sure the outbound skb has enough header + * room reserved. + */ + skb_reserve(nskb, packet->overhead + MAX_HEADER); + } else { + nskb = head; + } - /* if this is the auth chunk that we are adding, - * store pointer where it will be added and put - * the auth into the packet. + /** + * 3.2 Chunk Field Descriptions + * + * The total length of a chunk (including Type, Length and + * Value fields) MUST be a multiple of 4 bytes. If the length + * of the chunk is not a multiple of 4 bytes, the sender MUST + * pad the chunk with all zero bytes and this padding is not + * included in the chunk length field. The sender should + * never pad with more than 3 bytes. + * + * [This whole comment explains WORD_ROUND() below.] */ - if (chunk == packet->auth) - auth = skb_tail_pointer(nskb); - memcpy(skb_put(nskb, chunk->skb->len), + pkt_size -= packet->overhead; + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); + if (sctp_chunk_is_data(chunk)) { + /* 6.3.1 C4) When data is in flight and when allowed + * by rule C5, a new RTT measurement MUST be made each + * round trip. Furthermore, new RTT measurements + * SHOULD be made no more than once per round-trip + * for a given destination transport address. + */ + + if (!chunk->resent && !tp->rto_pending) { + chunk->rtt_in_progress = 1; + tp->rto_pending = 1; + } + + has_data = 1; + } + + padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + if (padding) + memset(skb_put(chunk->skb, padding), 0, padding); + + /* if this is the auth chunk that we are adding, + * store pointer where it will be added and put + * the auth into the packet. + */ + if (chunk == packet->auth) + auth = skb_tail_pointer(nskb); + + memcpy(skb_put(nskb, chunk->skb->len), chunk->skb->data, chunk->skb->len); - pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, " - "rtt_in_progress:%d\n", chunk, - sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), - chunk->has_tsn ? "TSN" : "No TSN", - chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, - ntohs(chunk->chunk_hdr->length), chunk->skb->len, - chunk->rtt_in_progress); - - /* - * If this is a control chunk, this is our last - * reference. Free data chunks after they've been - * acknowledged or have failed. + pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", + chunk, + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), + chunk->has_tsn ? "TSN" : "No TSN", + chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, + ntohs(chunk->chunk_hdr->length), chunk->skb->len, + chunk->rtt_in_progress); + + /* If this is a control chunk, this is our last + * reference. Free data chunks after they've been + * acknowledged or have failed. + * Re-queue auth chunks if needed. + */ + pkt_size -= WORD_ROUND(chunk->skb->len); + + if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) + sctp_chunk_free(chunk); + + if (!pkt_size) + break; + } + + /* SCTP-AUTH, Section 6.2 + * The sender MUST calculate the MAC as described in RFC2104 [2] + * using the hash function H as described by the MAC Identifier and + * the shared association key K based on the endpoint pair shared key + * described by the shared key identifier. The 'data' used for the + * computation of the AUTH-chunk is given by the AUTH chunk with its + * HMAC field set to zero (as shown in Figure 6) followed by all + * chunks that are placed after the AUTH chunk in the SCTP packet. */ - if (!sctp_chunk_is_data(chunk)) - sctp_chunk_free(chunk); - } + if (auth) + sctp_auth_calculate_hmac(asoc, nskb, + (struct sctp_auth_chunk *)auth, + gfp); + + if (packet->auth) { + if (!list_empty(&packet->chunk_list)) { + /* We will generate more packets, so re-queue + * auth chunk. + */ + list_add(&packet->auth->list, + &packet->chunk_list); + } else { + sctp_chunk_free(packet->auth); + packet->auth = NULL; + } + } - /* SCTP-AUTH, Section 6.2 - * The sender MUST calculate the MAC as described in RFC2104 [2] - * using the hash function H as described by the MAC Identifier and - * the shared association key K based on the endpoint pair shared key - * described by the shared key identifier. The 'data' used for the - * computation of the AUTH-chunk is given by the AUTH chunk with its - * HMAC field set to zero (as shown in Figure 6) followed by all - * chunks that are placed after the AUTH chunk in the SCTP packet. - */ - if (auth) - sctp_auth_calculate_hmac(asoc, nskb, - (struct sctp_auth_chunk *)auth, - gfp); + if (!gso) + break; + + if (skb_gro_receive(&head, nskb)) + goto nomem; + nskb = NULL; + if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= + sk->sk_gso_max_segs)) + goto nomem; + } while (!list_empty(&packet->chunk_list)); /* 2) Calculate the Adler-32 checksum of the whole packet, * including the SCTP common header and all the @@ -532,16 +635,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in . + * + * If it's a GSO packet, it's postponed to sctp_skb_segment. */ - if (!sctp_checksum_disable) { - if (!(dst->dev->features & NETIF_F_SCTP_CRC) || - (dst_xfrm(dst) != NULL) || packet->ipfragok) { - sh->checksum = sctp_compute_cksum(nskb, 0); + if (!sctp_checksum_disable || gso) { + if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) || + dst_xfrm(dst) || packet->ipfragok)) { + sh->checksum = sctp_compute_cksum(head, 0); } else { /* no need to seed pseudo checksum for SCTP */ - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = skb_transport_header(nskb) - nskb->head; - nskb->csum_offset = offsetof(struct sctphdr, checksum); + head->ip_summed = CHECKSUM_PARTIAL; + head->csum_start = skb_transport_header(head) - head->head; + head->csum_offset = offsetof(struct sctphdr, checksum); } } @@ -557,7 +662,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * Note: The works for IPv6 layer checks this bit too later * in transmission. See IP6_ECN_flow_xmit(). */ - tp->af_specific->ecn_capable(nskb->sk); + tp->af_specific->ecn_capable(sk); /* Set up the IP options. */ /* BUG: not implemented @@ -566,7 +671,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) /* Dump that on IP! */ if (asoc) { - asoc->stats.opackets++; + asoc->stats.opackets += pktcount; if (asoc->peer.last_sent_to != tp) /* Considering the multiple CPU scenario, this is a * "correcter" place for last_sent_to. --xguo @@ -589,16 +694,36 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) } } - pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len); + pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); - nskb->ignore_df = packet->ipfragok; - tp->af_specific->sctp_xmit(nskb, tp); + if (gso) { + /* Cleanup our debris for IP stacks */ + memset(head->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + + skb_shinfo(head)->gso_segs = pktcount; + skb_shinfo(head)->gso_size = GSO_BY_FRAGS; + + /* We have to refresh this in case we are xmiting to + * more than one transport at a time + */ + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + rcu_read_unlock(); + } + head->ignore_df = packet->ipfragok; + tp->af_specific->sctp_xmit(head, tp); out: sctp_packet_reset(packet); return err; no_route: - kfree_skb(nskb); + kfree_skb(head); + if (nskb != head) + kfree_skb(nskb); if (asoc) IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); @@ -623,6 +748,8 @@ err: } goto out; nomem: + if (packet->auth && list_empty(&packet->auth->list)) + sctp_chunk_free(packet->auth); err = -ENOMEM; goto err; } @@ -751,39 +878,74 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { - size_t psize; - size_t pmtu; - int too_big; + size_t psize, pmtu, maxsize; sctp_xmit_t retval = SCTP_XMIT_OK; psize = packet->size; - pmtu = ((packet->transport->asoc) ? - (packet->transport->asoc->pathmtu) : - (packet->transport->pathmtu)); - - too_big = (psize + chunk_len > pmtu); + if (packet->transport->asoc) + pmtu = packet->transport->asoc->pathmtu; + else + pmtu = packet->transport->pathmtu; /* Decide if we need to fragment or resubmit later. */ - if (too_big) { - /* It's OK to fragmet at IP level if any one of the following + if (psize + chunk_len > pmtu) { + /* It's OK to fragment at IP level if any one of the following * is true: - * 1. The packet is empty (meaning this chunk is greater - * the MTU) - * 2. The chunk we are adding is a control chunk - * 3. The packet doesn't have any data in it yet and data - * requires authentication. + * 1. The packet is empty (meaning this chunk is greater + * the MTU) + * 2. The packet doesn't have any data in it yet and data + * requires authentication. */ - if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk) || + if (sctp_packet_empty(packet) || (!packet->has_data && chunk->auth)) { /* We no longer do re-fragmentation. * Just fragment at the IP layer, if we * actually hit this condition */ packet->ipfragok = 1; - } else { - retval = SCTP_XMIT_PMTU_FULL; + goto out; } + + /* Similarly, if this chunk was built before a PMTU + * reduction, we have to fragment it at IP level now. So + * if the packet already contains something, we need to + * flush. + */ + maxsize = pmtu - packet->overhead; + if (packet->auth) + maxsize -= WORD_ROUND(packet->auth->skb->len); + if (chunk_len > maxsize) + retval = SCTP_XMIT_PMTU_FULL; + + /* It is also okay to fragment if the chunk we are + * adding is a control chunk, but only if current packet + * is not a GSO one otherwise it causes fragmentation of + * a large frame. So in this case we allow the + * fragmentation by forcing it to be in a new packet. + */ + if (!sctp_chunk_is_data(chunk) && packet->has_data) + retval = SCTP_XMIT_PMTU_FULL; + + if (psize + chunk_len > packet->max_size) + /* Hit GSO/PMTU limit, gotta flush */ + retval = SCTP_XMIT_PMTU_FULL; + + if (!packet->transport->burst_limited && + psize + chunk_len > (packet->transport->cwnd >> 1)) + /* Do not allow a single GSO packet to use more + * than half of cwnd. + */ + retval = SCTP_XMIT_PMTU_FULL; + + if (packet->transport->burst_limited && + psize + chunk_len > (packet->transport->burst_limited >> 1)) + /* Do not allow a single GSO packet to use more + * than half of original cwnd. + */ + retval = SCTP_XMIT_PMTU_FULL; + /* Otherwise it will fit in the GSO packet */ } +out: return retval; } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 084718f9b..107233da5 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -326,6 +326,9 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) sctp_chunk_hold(chunk); sctp_outq_tail_data(q, chunk); + if (chunk->asoc->peer.prsctp_capable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + chunk->asoc->sent_cnt_removable++; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); else @@ -372,6 +375,96 @@ static void sctp_insert_list(struct list_head *head, struct list_head *new) list_add_tail(new, head); } +static int sctp_prsctp_prune_sent(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct list_head *queue, int msg_len) +{ + struct sctp_chunk *chk, *temp; + + list_for_each_entry_safe(chk, temp, queue, transmitted_list) { + if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) + continue; + + list_del_init(&chk->transmitted_list); + sctp_insert_list(&asoc->outqueue.abandoned, + &chk->transmitted_list); + + asoc->sent_cnt_removable--; + asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + + if (!chk->tsn_gap_acked) { + if (chk->transport) + chk->transport->flight_size -= + sctp_data_size(chk); + asoc->outqueue.outstanding_bytes -= sctp_data_size(chk); + } + + msg_len -= SCTP_DATA_SNDSIZE(chk) + + sizeof(struct sk_buff) + + sizeof(struct sctp_chunk); + if (msg_len <= 0) + break; + } + + return msg_len; +} + +static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct list_head *queue, int msg_len) +{ + struct sctp_chunk *chk, *temp; + + list_for_each_entry_safe(chk, temp, queue, list) { + if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) + continue; + + list_del_init(&chk->list); + asoc->sent_cnt_removable--; + asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + + msg_len -= SCTP_DATA_SNDSIZE(chk) + + sizeof(struct sk_buff) + + sizeof(struct sctp_chunk); + sctp_chunk_free(chk); + if (msg_len <= 0) + break; + } + + return msg_len; +} + +/* Abandon the chunks according their priorities */ +void sctp_prsctp_prune(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, int msg_len) +{ + struct sctp_transport *transport; + + if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable) + return; + + msg_len = sctp_prsctp_prune_sent(asoc, sinfo, + &asoc->outqueue.retransmit, + msg_len); + if (msg_len <= 0) + return; + + list_for_each_entry(transport, &asoc->peer.transport_addr_list, + transports) { + msg_len = sctp_prsctp_prune_sent(asoc, sinfo, + &transport->transmitted, + msg_len); + if (msg_len <= 0) + return; + } + + sctp_prsctp_prune_unsent(asoc, sinfo, + &asoc->outqueue.out_chunk_list, + msg_len); +} + /* Mark all the eligible packets on a transport for retransmission. */ void sctp_retransmit_mark(struct sctp_outq *q, struct sctp_transport *transport, @@ -962,6 +1055,9 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) /* Mark as failed send. */ sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); + if (asoc->peer.prsctp_capable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; sctp_chunk_free(chunk); continue; } @@ -1251,6 +1347,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) tsn = ntohl(tchunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(&tchunk->transmitted_list); + if (asoc->peer.prsctp_capable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; sctp_chunk_free(tchunk); } } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d3d50daa2..7b523e3f5 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -240,6 +240,7 @@ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, port = &addr->v4.sin_port; addr->v4.sin_family = AF_INET; + /* Always called on head skb, so this is safe */ sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; @@ -1027,7 +1028,7 @@ static const struct proto_ops inet_seqpacket_ops = { .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */ .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, + .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT @@ -1479,7 +1480,8 @@ static __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); } - if (sctp_transport_hashtable_init()) + status = sctp_transport_hashtable_init(); + if (status) goto err_thash_alloc; pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize, @@ -1516,6 +1518,9 @@ static __init int sctp_init(void) if (status) goto err_v6_add_protocol; + if (sctp_offload_init() < 0) + pr_crit("%s: Cannot add SCTP protocol offload\n", __func__); + out: return status; err_v6_add_protocol: diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 10bae2201..cef0cee18 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -13,6 +13,7 @@ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, { union sctp_addr laddr, paddr; struct dst_entry *dst; + struct timer_list *t3_rtx = &asoc->peer.primary_path->T3_rtx_timer; laddr = list_entry(asoc->base.bind_addr.address_list.next, struct sctp_sockaddr_entry, list)->a; @@ -40,10 +41,15 @@ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, } r->idiag_state = asoc->state; - r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; - r->idiag_retrans = asoc->rtx_data_chunks; - r->idiag_expires = jiffies_to_msecs( - asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] - jiffies); + if (timer_pending(t3_rtx)) { + r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; + r->idiag_retrans = asoc->rtx_data_chunks; + r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies); + } else { + r->idiag_timer = 0; + r->idiag_retrans = 0; + r->idiag_expires = 0; + } } static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, @@ -266,28 +272,17 @@ out: return err; } -static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) +static int sctp_sock_dump(struct sock *sk, void *p) { - struct sctp_endpoint *ep = tsp->asoc->ep; + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_comm_param *commp = p; - struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; - struct sctp_association *assoc = - list_entry(ep->asocs.next, struct sctp_association, asocs); + struct sctp_association *assoc; int err = 0; - /* find the ep only once through the transports by this condition */ - if (tsp->asoc != assoc) - goto out; - - if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) - goto out; - lock_sock(sk); - if (sk != assoc->base.sk) - goto release; list_for_each_entry(assoc, &ep->asocs, asocs) { if (cb->args[4] < cb->args[1]) goto next; @@ -306,7 +301,7 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh) < 0) { cb->args[3] = 1; - err = 2; + err = 1; goto release; } cb->args[3] = 1; @@ -315,7 +310,7 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) { - err = 2; + err = 1; goto release; } next: @@ -327,10 +322,35 @@ next: cb->args[4] = 0; release: release_sock(sk); + sock_put(sk); return err; +} + +static int sctp_get_sock(struct sctp_transport *tsp, void *p) +{ + struct sctp_endpoint *ep = tsp->asoc->ep; + struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; + struct netlink_callback *cb = commp->cb; + const struct inet_diag_req_v2 *r = commp->r; + struct sctp_association *assoc = + list_entry(ep->asocs.next, struct sctp_association, asocs); + + /* find the ep only once through the transports by this condition */ + if (tsp->asoc != assoc) + goto out; + + if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) + goto out; + + sock_hold(sk); + cb->args[5] = (long)sk; + + return 1; + out: cb->args[2]++; - return err; + return 0; } static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) @@ -350,7 +370,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) if (cb->args[4] < cb->args[1]) goto next; - if ((r->idiag_states & ~TCPF_LISTEN) && !list_empty(&ep->asocs)) + if (!(r->idiag_states & TCPF_LISTEN) && !list_empty(&ep->asocs)) goto next; if (r->sdiag_family != AF_UNSPEC && @@ -466,10 +486,18 @@ skip: * 2 : to record the transport pos of this time's traversal * 3 : to mark if we have dumped the ep info of the current asoc * 4 : to work as a temporary variable to traversal list + * 5 : to save the sk we get from travelsing the tsp list. */ - if (!(idiag_states & ~TCPF_LISTEN)) + if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) goto done; - sctp_for_each_transport(sctp_tsp_dump, net, cb->args[2], &commp); + +next: + cb->args[5] = 0; + sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp); + + if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp)) + goto next; + done: cb->args[1] = cb->args[4]; cb->args[4] = 0; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 56f364d8f..46ffecc57 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -108,14 +108,9 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) /* What was the inbound interface for this chunk? */ int sctp_chunk_iif(const struct sctp_chunk *chunk) { - struct sctp_af *af; - int iif = 0; - - af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version)); - if (af) - iif = af->skb_iif(chunk->skb); + struct sk_buff *skb = chunk->skb; - return iif; + return SCTP_INPUT_CB(skb)->af->skb_iif(skb); } /* RFC 2960 3.3.2 Initiation (INIT) (1) @@ -261,7 +256,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); - if (net->sctp.prsctp_enable) + if (asoc->prsctp_enable) chunksize += sizeof(prsctp_param); /* ADDIP: Section 4.2.7: @@ -355,7 +350,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, sctp_addto_param(retval, num_ext, extensions); } - if (net->sctp.prsctp_enable) + if (asoc->prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { @@ -1585,7 +1580,6 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, struct sctp_association *asoc; struct sk_buff *skb; sctp_scope_t scope; - struct sctp_af *af; /* Create the bare association. */ scope = sctp_scope(sctp_source(chunk)); @@ -1595,16 +1589,10 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ - af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version)); - if (unlikely(!af)) - goto fail; - af->from_skb(&asoc->c.peer_addr, skb, 1); + SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1); + nodata: return asoc; - -fail: - sctp_association_free(asoc); - return NULL; } /* Build a cookie representing asoc. @@ -2024,8 +2012,8 @@ static void sctp_process_ext_param(struct sctp_association *asoc, for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_FWD_TSN: - if (net->sctp.prsctp_enable && !asoc->peer.prsctp_capable) - asoc->peer.prsctp_capable = 1; + if (asoc->prsctp_enable && !asoc->peer.prsctp_capable) + asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: /* if the peer reports AUTH, assume that he @@ -2169,7 +2157,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net, break; case SCTP_PARAM_FWD_TSN_SUPPORT: - if (net->sctp.prsctp_enable) + if (ep->prsctp_enable) break; goto fallthrough; @@ -2653,7 +2641,7 @@ do_addr_param: break; case SCTP_PARAM_FWD_TSN_SUPPORT: - if (net->sctp.prsctp_enable) { + if (asoc->prsctp_enable) { asoc->peer.prsctp_capable = 1; break; } diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index aa3712259..12d451933 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -806,8 +806,10 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */ if (sctp_state(asoc, SHUTDOWN_RECEIVED) && - sctp_sstate(sk, ESTABLISHED)) + sctp_sstate(sk, ESTABLISHED)) { + sk->sk_state = SCTP_SS_CLOSING; sk->sk_shutdown |= RCV_SHUTDOWN; + } } if (sctp_state(asoc, COOKIE_WAIT)) { diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index f1f08c8f2..d88bb2b0b 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6118,14 +6118,11 @@ static int sctp_eat_data(const struct sctp_association *asoc, * chunk later. */ - if (!chunk->ecn_ce_done) { - struct sctp_af *af; + if (asoc->peer.ecn_capable && !chunk->ecn_ce_done) { + struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af; chunk->ecn_ce_done = 1; - af = sctp_get_af_specific( - ipver2af(ip_hdr(chunk->skb)->version)); - - if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) { + if (af->is_ce(sctp_gso_headskb(chunk->skb))) { /* Do real work as sideffect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, SCTP_U32(tsn)); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 7f5689a93..8ed2d99bd 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -202,7 +202,7 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id) * could be a TCP-style listening socket or a socket which * hasn't yet called connect() to establish an association. */ - if (!sctp_sstate(sk, ESTABLISHED)) + if (!sctp_sstate(sk, ESTABLISHED) && !sctp_sstate(sk, CLOSING)) return NULL; /* Get the first and the only association from the list. */ @@ -1068,7 +1068,7 @@ static int __sctp_connect(struct sock *sk, * is already connected. * It cannot be done even on a TCP-style listening socket. */ - if (sctp_sstate(sk, ESTABLISHED) || + if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) || (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) { err = -EISCONN; goto out_free; @@ -1705,18 +1705,19 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) if (msg_name) { /* Look for a matching association on the endpoint. */ asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); - if (!asoc) { - /* If we could not find a matching association on the - * endpoint, make sure that it is not a TCP-style - * socket that already has an association or there is - * no peeled-off association on another socket. - */ - if ((sctp_style(sk, TCP) && - sctp_sstate(sk, ESTABLISHED)) || - sctp_endpoint_is_peeled_off(ep, &to)) { - err = -EADDRNOTAVAIL; - goto out_unlock; - } + + /* If we could not find a matching association on the + * endpoint, make sure that it is not a TCP-style + * socket that already has an association or there is + * no peeled-off association on another socket. + */ + if (!asoc && + ((sctp_style(sk, TCP) && + (sctp_sstate(sk, ESTABLISHED) || + sctp_sstate(sk, CLOSING))) || + sctp_endpoint_is_peeled_off(ep, &to))) { + err = -EADDRNOTAVAIL; + goto out_unlock; } } else { asoc = sctp_id2assoc(sk, associd); @@ -1914,6 +1915,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_free; } + if (sctp_wspace(asoc) < msg_len) + sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); @@ -2063,7 +2067,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, { struct sctp_ulpevent *event = NULL; struct sctp_sock *sp = sctp_sk(sk); - struct sk_buff *skb; + struct sk_buff *skb, *head_skb; int copied; int err = 0; int skb_len; @@ -2074,7 +2078,8 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, lock_sock(sk); - if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED)) { + if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED) && + !sctp_sstate(sk, CLOSING) && !sctp_sstate(sk, CLOSED)) { err = -ENOTCONN; goto out; } @@ -2099,12 +2104,16 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (err) goto out_free; - sock_recv_ts_and_drops(msg, sk, skb); + if (event->chunk && event->chunk->head_skb) + head_skb = event->chunk->head_skb; + else + head_skb = skb; + sock_recv_ts_and_drops(msg, sk, head_skb); if (sctp_ulpevent_is_notification(event)) { msg->msg_flags |= MSG_NOTIFICATION; sp->pf->event_msgname(event, msg->msg_name, addr_len); } else { - sp->pf->skb_msgname(skb, msg->msg_name, addr_len); + sp->pf->skb_msgname(head_skb, msg->msg_name, addr_len); } /* Check if we allow SCTP_NXTINFO. */ @@ -3661,6 +3670,80 @@ static int sctp_setsockopt_recvnxtinfo(struct sock *sk, return 0; } +static int sctp_setsockopt_pr_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + asoc->prsctp_enable = !!params.assoc_value; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + sp->ep->prsctp_enable = !!params.assoc_value; + } else { + goto out; + } + + retval = 0; + +out: + return retval; +} + +static int sctp_setsockopt_default_prinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_default_prinfo info; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(info)) + goto out; + + if (copy_from_user(&info, optval, sizeof(info))) { + retval = -EFAULT; + goto out; + } + + if (info.pr_policy & ~SCTP_PR_SCTP_MASK) + goto out; + + if (info.pr_policy == SCTP_PR_SCTP_NONE) + info.pr_value = 0; + + asoc = sctp_id2assoc(sk, info.pr_assoc_id); + if (asoc) { + SCTP_PR_SET_POLICY(asoc->default_flags, info.pr_policy); + asoc->default_timetolive = info.pr_value; + } else if (!info.pr_assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + SCTP_PR_SET_POLICY(sp->default_flags, info.pr_policy); + sp->default_timetolive = info.pr_value; + } else { + goto out; + } + + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3821,6 +3904,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_RECVNXTINFO: retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen); break; + case SCTP_PR_SUPPORTED: + retval = sctp_setsockopt_pr_supported(sk, optval, optlen); + break; + case SCTP_DEFAULT_PRINFO: + retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4003,6 +4092,8 @@ static int sctp_init_sock(struct sock *sk) return -ESOCKTNOSUPPORT; } + sk->sk_gso_type = SKB_GSO_SCTP; + /* Initialize default send parameters. These parameters can be * modified with the SCTP_DEFAULT_SEND_PARAM socket option. */ @@ -4193,6 +4284,7 @@ static void sctp_shutdown(struct sock *sk, int how) return; if (how & SEND_SHUTDOWN) { + sk->sk_state = SCTP_SS_CLOSING; ep = sctp_sk(sk)->ep; if (!list_empty(&ep->asocs)) { asoc = list_entry(ep->asocs.next, @@ -4377,17 +4469,21 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), const union sctp_addr *paddr, void *p) { struct sctp_transport *transport; - int err = 0; + int err = -ENOENT; rcu_read_lock(); transport = sctp_addrs_lookup_transport(net, laddr, paddr); if (!transport || !sctp_transport_hold(transport)) goto out; - err = cb(transport, p); + + sctp_association_hold(transport->asoc); sctp_transport_put(transport); -out: rcu_read_unlock(); + err = cb(transport, p); + sctp_association_put(transport->asoc); + +out: return err; } EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); @@ -6164,6 +6260,148 @@ static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_pr_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->prsctp_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->ep->prsctp_enable; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + +static int sctp_getsockopt_default_prinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_default_prinfo info; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(info)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(info); + if (copy_from_user(&info, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, info.pr_assoc_id); + if (asoc) { + info.pr_policy = SCTP_PR_POLICY(asoc->default_flags); + info.pr_value = asoc->default_timetolive; + } else if (!info.pr_assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + info.pr_policy = SCTP_PR_POLICY(sp->default_flags); + info.pr_value = sp->default_timetolive; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, &info, len)) + goto out; + + retval = 0; + +out: + return retval; +} + +static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_prstatus params; + struct sctp_association *asoc; + int policy; + int retval = -EINVAL; + + if (len < sizeof(params)) + goto out; + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) { + retval = -EFAULT; + goto out; + } + + policy = params.sprstat_policy; + if (policy & ~SCTP_PR_SCTP_MASK) + goto out; + + asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); + if (!asoc) + goto out; + + if (policy == SCTP_PR_SCTP_NONE) { + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { + params.sprstat_abandoned_unsent += + asoc->abandoned_unsent[policy]; + params.sprstat_abandoned_sent += + asoc->abandoned_sent[policy]; + } + } else { + params.sprstat_abandoned_unsent = + asoc->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + params.sprstat_abandoned_sent = + asoc->abandoned_sent[__SCTP_PR_INDEX(policy)]; + } + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ¶ms, len)) { + retval = -EFAULT; + goto out; + } + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6317,6 +6555,17 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_RECVNXTINFO: retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen); break; + case SCTP_PR_SUPPORTED: + retval = sctp_getsockopt_pr_supported(sk, len, optval, optlen); + break; + case SCTP_DEFAULT_PRINFO: + retval = sctp_getsockopt_default_prinfo(sk, len, optval, + optlen); + break; + case SCTP_PR_ASSOC_STATUS: + retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6864,7 +7113,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | + SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -6888,7 +7137,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | + SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -7565,10 +7814,13 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, /* If the association on the newsk is already closed before accept() * is called, set RCV_SHUTDOWN flag. */ - if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) + if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) { + newsk->sk_state = SCTP_SS_CLOSED; newsk->sk_shutdown |= RCV_SHUTDOWN; + } else { + newsk->sk_state = SCTP_SS_ESTABLISHED; + } - newsk->sk_state = SCTP_SS_ESTABLISHED; release_sock(newsk); } diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d1e38308f..d85b803da 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -51,7 +51,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); /* Initialize an ULP event from an given skb. */ static void sctp_ulpevent_init(struct sctp_ulpevent *event, - int msg_flags, + __u16 msg_flags, unsigned int len) { memset(event, 0, sizeof(struct sctp_ulpevent)); @@ -60,7 +60,7 @@ static void sctp_ulpevent_init(struct sctp_ulpevent *event, } /* Create a new sctp_ulpevent. */ -static struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, +static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags, gfp_t gfp) { struct sctp_ulpevent *event; @@ -91,6 +91,7 @@ int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event) static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, const struct sctp_association *asoc) { + struct sctp_chunk *chunk = event->chunk; struct sk_buff *skb; /* Cast away the const, as we are just wanting to @@ -101,6 +102,8 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, event->asoc = (struct sctp_association *)asoc; atomic_add(event->rmem_len, &event->asoc->rmem_alloc); sctp_skb_set_owner_r(skb, asoc->base.sk); + if (chunk && chunk->head_skb && !chunk->head_skb->sk) + chunk->head_skb->sk = asoc->base.sk; } /* A simple destructor to give up the reference to the association. */ @@ -699,6 +702,12 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, */ sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff)); + /* And hold the chunk as we need it for getting the IP headers + * later in recvmsg + */ + sctp_chunk_hold(chunk); + event->chunk = chunk; + sctp_ulpevent_receive_data(event, asoc); event->stream = ntohs(chunk->subh.data_hdr->stream); @@ -710,11 +719,11 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, } event->tsn = ntohl(chunk->subh.data_hdr->tsn); event->msg_flags |= chunk->chunk_hdr->flags; - event->iif = sctp_chunk_iif(chunk); return event; fail_mark: + sctp_chunk_put(chunk); kfree_skb(skb); fail: return NULL; @@ -1007,6 +1016,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) done: sctp_assoc_rwnd_increase(event->asoc, len); + sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } @@ -1029,6 +1039,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) } done: + sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index ec166d2bd..877e55066 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -204,7 +204,9 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) /* If the socket is just going to throw this away, do not * even try to deliver it. */ - if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) + if (sk->sk_shutdown & RCV_SHUTDOWN && + (sk->sk_shutdown & SEND_SHUTDOWN || + !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 040ff627c..a7e42f9a4 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) ret = kstrtoul(val, 0, &num); if (ret == -EINVAL) goto out_inval; - nbits = fls(num); - if (num > (1U << nbits)) - nbits++; + nbits = fls(num - 1); if (nbits > MAX_HASHTABLE_BITS || nbits < 2) goto out_inval; *(unsigned int *)kp->arg = nbits; @@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred) EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); bool -rpcauth_cred_key_to_expire(struct rpc_cred *cred) +rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) { + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) + return false; if (!cred->cr_ops->crkey_to_expire) return false; return cred->cr_ops->crkey_to_expire(cred); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 54dd3fdea..168219535 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) /* Fast track for non crkey_timeout (no key) underlying credentials */ - if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) return 0; /* Fast track for the normal case */ @@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) if (IS_ERR(tcred)) return -EACCES; - if (!tcred->cr_ops->crkey_timeout) { - set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags); - ret = 0; - goto out_put; - } - /* Test for the almost error case */ ret = tcred->cr_ops->crkey_timeout(tcred); if (ret != 0) { @@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); } -out_put: put_rpccred(tcred); return ret; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index bf4b0e98f..976c7812b 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1017,8 +1017,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_flags = 0; auth->au_ops = &authgss_ops; auth->au_flavor = flavor; + if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) + auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; atomic_set(&auth->au_count, 1); kref_init(&gss_auth->kref); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 65427492b..605958353 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = { .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_INTEGRITY, .name = "krb5i", + .datatouch = true, }, [2] = { .pseudoflavor = RPC_AUTH_GSS_KRB5P, .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_PRIVACY, .name = "krb5p", + .datatouch = true, }, }; diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 7063d856a..5fec3abbe 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) } EXPORT_SYMBOL(gss_pseudoflavor_to_service); +bool +gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return gm->gm_pfs[i].datatouch; + } + return false; +} + char * gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) { diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 4605dc73d..d8582028b 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1231,8 +1231,9 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, if (status) goto out; - dprintk("RPC: svcauth_gss: gss major status = %d\n", - ud.major_status); + dprintk("RPC: svcauth_gss: gss major status = %d " + "minor status = %d\n", + ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 8d9eb4d5d..4d17376b2 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -115,6 +115,7 @@ static struct rpc_auth null_auth = { .au_cslack = NUL_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 9f65452b7..a99278c98 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -228,6 +228,7 @@ static struct rpc_auth unix_auth = { .au_cslack = UNX_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 553bf95f7..4d8e11f94 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -362,7 +362,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) cache_purge(cd); spin_lock(&cache_list_lock); write_lock(&cd->hash_lock); - if (cd->entries || atomic_read(&cd->inuse)) { + if (cd->entries) { write_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); goto out; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2808d550d..66f23b376 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -453,7 +453,7 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, struct rpc_xprt_switch *xps; if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) { - WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP); + WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC)); xps = args->bc_xprt->xpt_bc_xps; xprt_switch_get(xps); } else { @@ -520,7 +520,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) char servername[48]; if (args->bc_xprt) { - WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP); + WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC)); xprt = args->bc_xprt->xpt_bc_xprt; if (xprt) { xprt_get(xprt); @@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata) kfree(data); } -const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = { +static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { .rpc_call_done = rpc_cb_add_xprt_done, .rpc_release = rpc_cb_add_xprt_release, }; @@ -2638,6 +2638,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; + unsigned long reconnect_timeout; unsigned char resvport; int ret = 0; @@ -2649,6 +2650,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, return -EAGAIN; } resvport = xprt->resvport; + reconnect_timeout = xprt->max_reconnect_timeout; rcu_read_unlock(); xprt = xprt_create_transport(xprtargs); @@ -2657,6 +2659,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, goto out_put_switch; } xprt->resvport = resvport; + xprt->max_reconnect_timeout = reconnect_timeout; rpc_xprt_switch_set_roundrobin(xps); if (setup) { @@ -2673,6 +2676,27 @@ out_put_switch: } EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); +static int +rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + void *data) +{ + unsigned long timeout = *((unsigned long *)data); + + if (timeout < xprt->max_reconnect_timeout) + xprt->max_reconnect_timeout = timeout; + return 0; +} + +void +rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) +{ + rpc_clnt_iterate_for_each_xprt(clnt, + rpc_xprt_cap_max_reconnect_timeout, + &timeo); +} +EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index fc48eca21..84f98cbe3 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1386,7 +1386,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct dentry *root, *gssd_dentry; - struct net *net = data; + struct net *net = get_net(sb->s_fs_info); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1419,7 +1419,6 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb); if (err) goto err_depopulate; - sb->s_fs_info = get_net(net); mutex_unlock(&sn->pipefs_sb_lock); return 0; @@ -1448,7 +1447,8 @@ static struct dentry * rpc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_ns(fs_type, flags, current->nsproxy->net_ns, rpc_fill_super); + struct net *net = current->nsproxy->net_ns; + return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super); } static void rpc_kill_sb(struct super_block *sb) @@ -1468,9 +1468,9 @@ static void rpc_kill_sb(struct super_block *sb) RPC_PIPEFS_UMOUNT, sb); mutex_unlock(&sn->pipefs_sb_lock); - put_net(net); out: kill_litter_super(sb); + put_net(net); } static struct file_system_type rpc_pipe_fs_type = { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index fcfd48d26..9ae588511 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue; /* * rpciod-related stuff */ -struct workqueue_struct *rpciod_workqueue; +struct workqueue_struct *rpciod_workqueue __read_mostly; +struct workqueue_struct *xprtiod_workqueue __read_mostly; /* * Disable the timer for a given RPC task. Should be called with @@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); * lockless RPC_IS_QUEUED() test) before we've had a chance to test * the RPC_TASK_RUNNING flag. */ -static void rpc_make_runnable(struct rpc_task *task) +static void rpc_make_runnable(struct workqueue_struct *wq, + struct rpc_task *task) { bool need_wakeup = !rpc_test_and_set_running(task); @@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task) return; if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); - queue_work(rpciod_workqueue, &task->u.tk_work); + queue_work(wq, &task->u.tk_work); } else wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); } @@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); /** - * __rpc_do_wake_up_task - wake up a single rpc_task + * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task + * @wq: workqueue on which to run task * @queue: wait queue * @task: task to be woken up * * Caller must hold queue->lock, and have cleared the task queued flag. */ -static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) +static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, + struct rpc_task *task) { dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", task->tk_pid, jiffies); @@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task __rpc_remove_wait_queue(queue, task); - rpc_make_runnable(task); + rpc_make_runnable(wq, task); dprintk("RPC: __rpc_wake_up_task done\n"); } @@ -436,15 +441,24 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task) { if (RPC_IS_QUEUED(task)) { smp_rmb(); if (task->tk_waitqueue == queue) - __rpc_do_wake_up_task(queue, task); + __rpc_do_wake_up_task_on_wq(wq, queue, task); } } +/* + * Wake up a queued task while the queue lock is being held + */ +static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); +} + /* * Wake up a task on a specific queue */ @@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue) /* * Wake up the first task on the wait queue. */ -struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, +struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, bool (*func)(struct rpc_task *, void *), void *data) { struct rpc_task *task = NULL; @@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, task = __rpc_find_next_queued(queue); if (task != NULL) { if (func(task, data)) - rpc_wake_up_task_queue_locked(queue, task); + rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); else task = NULL; } @@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, return task; } + +/* + * Wake up the first task on the wait queue. + */ +struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, + bool (*func)(struct rpc_task *, void *), void *data) +{ + return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data); +} EXPORT_SYMBOL_GPL(rpc_wake_up_first); static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) @@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task) bool is_async = RPC_IS_ASYNC(task); rpc_set_active(task); - rpc_make_runnable(task); + rpc_make_runnable(rpciod_workqueue, task); if (!is_async) __rpc_execute(task); } @@ -1071,10 +1095,22 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. */ dprintk("RPC: creating workqueue rpciod\n"); - /* Note: highpri because network receive is latency sensitive */ - wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); + if (!wq) + goto out_failed; rpciod_workqueue = wq; - return rpciod_workqueue != NULL; + /* Note: highpri because network receive is latency sensitive */ + wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + if (!wq) + goto free_rpciod; + xprtiod_workqueue = wq; + return 1; +free_rpciod: + wq = rpciod_workqueue; + rpciod_workqueue = NULL; + destroy_workqueue(wq); +out_failed: + return 0; } static void rpciod_stop(void) @@ -1088,6 +1124,9 @@ static void rpciod_stop(void) wq = rpciod_workqueue; rpciod_workqueue = NULL; destroy_workqueue(wq); + wq = xprtiod_workqueue; + xprtiod_workqueue = NULL; + destroy_workqueue(wq); } void diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index cc9852897..c5b0cb4f4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); /* Encode reply */ - if (test_bit(RQ_DROPME, &rqstp->rq_flags)) { + if (*statp == rpc_drop_reply || + test_bit(RQ_DROPME, &rqstp->rq_flags)) { if (procp->pc_release) procp->pc_release(rqstp, NULL, rqstp->rq_resp); goto dropit; } + if (*statp == rpc_autherr_badcred) { + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + goto err_bad_auth; + } if (*statp == rpc_success && (xdr = procp->pc_encode) && !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 4f01f6310..c3f652395 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -21,6 +21,10 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static unsigned int svc_rpc_per_connection_limit __read_mostly; +module_param(svc_rpc_per_connection_limit, uint, 0644); + + static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); @@ -329,12 +333,45 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) } EXPORT_SYMBOL_GPL(svc_print_addr); +static bool svc_xprt_slots_in_range(struct svc_xprt *xprt) +{ + unsigned int limit = svc_rpc_per_connection_limit; + int nrqsts = atomic_read(&xprt->xpt_nr_rqsts); + + return limit == 0 || (nrqsts >= 0 && nrqsts < limit); +} + +static bool svc_xprt_reserve_slot(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + if (!test_bit(RQ_DATA, &rqstp->rq_flags)) { + if (!svc_xprt_slots_in_range(xprt)) + return false; + atomic_inc(&xprt->xpt_nr_rqsts); + set_bit(RQ_DATA, &rqstp->rq_flags); + } + return true; +} + +static void svc_xprt_release_slot(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) { + atomic_dec(&xprt->xpt_nr_rqsts); + svc_xprt_enqueue(xprt); + } +} + static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) { if (xprt->xpt_flags & ((1<xpt_flags & ((1<xpt_ops->xpo_has_wspace(xprt); + if (xprt->xpt_flags & ((1<xpt_ops->xpo_has_wspace(xprt) && + svc_xprt_slots_in_range(xprt)) + return true; + trace_svc_xprt_no_write_space(xprt); + return false; + } return false; } @@ -480,8 +517,6 @@ void svc_reserve(struct svc_rqst *rqstp, int space) atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); rqstp->rq_reserved = space; - if (xprt->xpt_ops->xpo_adjust_wspace) - xprt->xpt_ops->xpo_adjust_wspace(xprt); svc_xprt_enqueue(xprt); } } @@ -512,8 +547,8 @@ static void svc_xprt_release(struct svc_rqst *rqstp) rqstp->rq_res.head[0].iov_len = 0; svc_reserve(rqstp, 0); + svc_xprt_release_slot(rqstp); rqstp->rq_xprt = NULL; - svc_xprt_put(xprt); } @@ -781,7 +816,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) svc_add_new_temp_xprt(serv, newxpt); else module_put(xprt->xpt_class->xcl_owner); - } else { + } else if (svc_xprt_reserve_slot(rqstp, xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", rqstp, rqstp->rq_pool->sp_id, xprt, @@ -871,6 +906,7 @@ EXPORT_SYMBOL_GPL(svc_recv); */ void svc_drop(struct svc_rqst *rqstp) { + trace_svc_drop(rqstp); dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); svc_xprt_release(rqstp); } @@ -1148,6 +1184,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) spin_unlock(&xprt->xpt_lock); dprintk("revisit canceled\n"); svc_xprt_put(xprt); + trace_svc_drop_deferred(dr); kfree(dr); return; } @@ -1205,6 +1242,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) set_bit(RQ_DROPME, &rqstp->rq_flags); dr->handle.revisit = svc_revisit; + trace_svc_defer(rqstp); return &dr->handle; } @@ -1245,6 +1283,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) struct svc_deferred_req, handle.recent); list_del_init(&dr->handle.recent); + trace_svc_revisit_deferred(dr); } else clear_bit(XPT_DEFERRED, &xprt->xpt_flags); spin_unlock(&xprt->xpt_lock); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index dadfec66d..57625f64e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -60,7 +60,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int flags); -static void svc_udp_data_ready(struct sock *); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); static void svc_sock_detach(struct svc_xprt *); @@ -398,48 +397,21 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp) return svc_port_is_privileged(svc_addr(rqstp)); } -static bool sunrpc_waitqueue_active(wait_queue_head_t *wq) -{ - if (!wq) - return false; - /* - * There should normally be a memory * barrier here--see - * wq_has_sleeper(). - * - * It appears that isn't currently necessary, though, basically - * because callers all appear to have sufficient memory barriers - * between the time the relevant change is made and the - * time they call these callbacks. - * - * The nfsd code itself doesn't actually explicitly wait on - * these waitqueues, but it may wait on them for example in - * sendpage() or sendmsg() calls. (And those may be the only - * places, since it it uses nonblocking reads.) - * - * Maybe we should add the memory barriers anyway, but these are - * hot paths so we'd need to be convinced there's no sigificant - * penalty. - */ - return waitqueue_active(wq); -} - /* * INET callback when data has been received on the socket. */ -static void svc_udp_data_ready(struct sock *sk) +static void svc_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq = sk_sleep(sk); if (svsk) { dprintk("svc: socket %p(inet %p), busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); + svsk->sk_odata(sk); + if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) + svc_xprt_enqueue(&svsk->sk_xprt); } - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible(wq); } /* @@ -448,56 +420,22 @@ static void svc_udp_data_ready(struct sock *sk) static void svc_write_space(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); - wait_queue_head_t *wq = sk_sleep(sk); if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + svsk->sk_owspace(sk); svc_xprt_enqueue(&svsk->sk_xprt); } - - if (sunrpc_waitqueue_active(wq)) { - dprintk("RPC svc_write_space: someone sleeping on %p\n", - svsk); - wake_up_interruptible(wq); - } } static int svc_tcp_has_wspace(struct svc_xprt *xprt) { - struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int required; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) return 1; - required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg; - if (sk_stream_wspace(svsk->sk_sk) >= required || - (sk_stream_min_wspace(svsk->sk_sk) == 0 && - atomic_read(&xprt->xpt_reserved) == 0)) - return 1; - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - return 0; -} - -static void svc_tcp_write_space(struct sock *sk) -{ - struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); - struct socket *sock = sk->sk_socket; - - if (!sk_stream_is_writeable(sk) || !sock) - return; - if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt)) - clear_bit(SOCK_NOSPACE, &sock->flags); - svc_write_space(sk); -} - -static void svc_tcp_adjust_wspace(struct svc_xprt *xprt) -{ - struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - - if (svc_tcp_has_wspace(xprt)) - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); } /* @@ -746,7 +684,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); - svsk->sk_sk->sk_data_ready = svc_udp_data_ready; + svsk->sk_sk->sk_data_ready = svc_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; /* initialise setting must have enough space to @@ -786,11 +724,12 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq; dprintk("svc: socket %p TCP (listen) state change %d\n", sk, sk->sk_state); + if (svsk) + svsk->sk_odata(sk); /* * This callback may called twice when a new connection * is established as a child socket inherits everything @@ -808,10 +747,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) } else printk("svc: socket %p: no user data\n", sk); } - - wq = sk_sleep(sk); - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible_all(wq); } /* @@ -820,7 +755,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) static void svc_tcp_state_change(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq = sk_sleep(sk); dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", sk, sk->sk_state, sk->sk_user_data); @@ -828,26 +762,12 @@ static void svc_tcp_state_change(struct sock *sk) if (!svsk) printk("svc: socket %p: no user data\n", sk); else { - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible_all(wq); -} - -static void svc_tcp_data_ready(struct sock *sk) -{ - struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq = sk_sleep(sk); - - dprintk("svc: socket %p TCP data ready (svsk %p)\n", - sk, sk->sk_user_data); - if (svsk) { - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); + svsk->sk_ostate(sk); + if (sk->sk_state != TCP_ESTABLISHED) { + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + } } - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible(wq); } /* @@ -901,6 +821,11 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) dprintk("%s: connect from %s\n", serv->sv_name, __svc_print_addr(sin, buf, sizeof(buf))); + /* Reset the inherited callbacks before calling svc_setup_socket */ + newsock->sk->sk_state_change = svsk->sk_ostate; + newsock->sk->sk_data_ready = svsk->sk_odata; + newsock->sk->sk_write_space = svsk->sk_owspace; + /* make sure that a write doesn't block forever when * low on memory */ @@ -1317,7 +1242,6 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, - .xpo_adjust_wspace = svc_tcp_adjust_wspace, }; static struct svc_xprt_class svc_tcp_class = { @@ -1357,8 +1281,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) } else { dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; - sk->sk_data_ready = svc_tcp_data_ready; - sk->sk_write_space = svc_tcp_write_space; + sk->sk_data_ready = svc_data_ready; + sk->sk_write_space = svc_write_space; svsk->sk_reclen = 0; svsk->sk_tcplen = 0; @@ -1368,8 +1292,13 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - if (sk->sk_state != TCP_ESTABLISHED) + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + break; + default: set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + } } } @@ -1428,17 +1357,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Initialize the socket */ if (sock->type == SOCK_DGRAM) svc_udp_init(svsk, serv); - else { - /* initialise setting must have enough space to - * receive and respond to one request. - */ - svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, - 4 * serv->sv_max_mesg); + else svc_tcp_init(svsk, serv); - } - dprintk("svc: svc_setup_socket created %p (inet %p)\n", - svsk, svsk->sk_sk); + dprintk("svc: svc_setup_socket created %p (inet %p), " + "listen %d close %d\n", + svsk, svsk->sk_sk, + test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); return svsk; } @@ -1606,18 +1532,16 @@ static void svc_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sock *sk = svsk->sk_sk; - wait_queue_head_t *wq; dprintk("svc: svc_sock_detach(%p)\n", svsk); /* put back the old socket callbacks */ + lock_sock(sk); sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; sk->sk_write_space = svsk->sk_owspace; - - wq = sk_sleep(sk); - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible(wq); + sk->sk_user_data = NULL; + release_sock(sk); } /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 216a13857..ea244b291 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_atomic(); } else - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); } /* @@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt)) + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, + __xprt_lock_write_func, xprt)) return; xprt_clear_locked(xprt); } @@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) return; if (RPCXPRT_CONGESTED(xprt)) goto out_unlock; - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt)) + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, + __xprt_lock_write_cong_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); @@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) set_bit(XPRT_CLOSE_WAIT, &xprt->state); /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -672,12 +674,26 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) set_bit(XPRT_CLOSE_WAIT, &xprt->state); /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); xprt_wake_pending_tasks(xprt, -EAGAIN); out: spin_unlock_bh(&xprt->transport_lock); } +static bool +xprt_has_timer(const struct rpc_xprt *xprt) +{ + return xprt->idle_timeout != 0; +} + +static void +xprt_schedule_autodisconnect(struct rpc_xprt *xprt) + __must_hold(&xprt->transport_lock) +{ + if (list_empty(&xprt->recv) && xprt_has_timer(xprt)) + mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); +} + static void xprt_init_autodisconnect(unsigned long data) { @@ -686,10 +702,12 @@ xprt_init_autodisconnect(unsigned long data) spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv)) goto out_abort; + /* Reset xprt->last_used to avoid connect/autodisconnect cycling */ + xprt->last_used = jiffies; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); return; out_abort: spin_unlock(&xprt->transport_lock); @@ -723,6 +741,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) goto out; xprt->snd_task =NULL; xprt->ops->release_xprt(xprt, NULL); + xprt_schedule_autodisconnect(xprt); out: spin_unlock_bh(&xprt->transport_lock); wake_up_bit(&xprt->state, XPRT_LOCKED); @@ -886,11 +905,6 @@ static void xprt_timer(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); } -static inline int xprt_has_timer(struct rpc_xprt *xprt) -{ - return xprt->idle_timeout != 0; -} - /** * xprt_prepare_transmit - reserve the transport before sending a request * @task: RPC task about to send a request @@ -1278,9 +1292,7 @@ void xprt_release(struct rpc_task *task) if (!list_empty(&req->rq_list)) list_del(&req->rq_list); xprt->last_used = jiffies; - if (list_empty(&xprt->recv) && xprt_has_timer(xprt)) - mod_timer(&xprt->timer, - xprt->last_used + xprt->idle_timeout); + xprt_schedule_autodisconnect(xprt); spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(req->rq_buffer); diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index dc9f3b513..ef19fa42c 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ - fmr_ops.o frwr_ops.o physical_ops.o \ + fmr_ops.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 6326ebe8b..21cb3b150 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -19,13 +19,6 @@ * verb (fmr_op_unmap). */ -/* Transport recovery - * - * After a transport reconnect, fmr_op_map re-uses the MR already - * allocated for the RPC, but generates a fresh rkey then maps the - * MR again. This process is synchronous. - */ - #include "xprt_rdma.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -35,62 +28,132 @@ /* Maximum scatter/gather per FMR */ #define RPCRDMA_MAX_FMR_SGES (64) -static struct workqueue_struct *fmr_recovery_wq; - -#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) +/* Access mode of externally registered pages */ +enum { + RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ, +}; -int -fmr_alloc_recovery_wq(void) +bool +fmr_is_supported(struct rpcrdma_ia *ia) { - fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); - return !fmr_recovery_wq ? -ENOMEM : 0; + if (!ia->ri_device->alloc_fmr) { + pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; + } + return true; } -void -fmr_destroy_recovery_wq(void) +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) { - struct workqueue_struct *wq; + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; - if (!fmr_recovery_wq) - return; + mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mw->fmr.fm_physaddrs) + goto out_free; - wq = fmr_recovery_wq; - fmr_recovery_wq = NULL; - destroy_workqueue(wq); + mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mw->mw_sg), GFP_KERNEL); + if (!mw->mw_sg) + goto out_free; + + sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES); + + mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mw->fmr.fm_mr)) + goto out_fmr_err; + + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mw->fmr.fm_mr)); + +out_free: + kfree(mw->mw_sg); + kfree(mw->fmr.fm_physaddrs); + return -ENOMEM; } static int __fmr_unmap(struct rpcrdma_mw *mw) { LIST_HEAD(l); + int rc; - list_add(&mw->fmr.fmr->list, &l); - return ib_unmap_fmr(&l); + list_add(&mw->fmr.fm_mr->list, &l); + rc = ib_unmap_fmr(&l); + list_del_init(&mw->fmr.fm_mr->list); + return rc; } -/* Deferred reset of a single FMR. Generate a fresh rkey by - * replacing the MR. There's no recovery if this fails. - */ static void -__fmr_recovery_worker(struct work_struct *work) +fmr_op_release_mr(struct rpcrdma_mw *r) { - struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, - mw_work); - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + LIST_HEAD(unmap_list); + int rc; - __fmr_unmap(mw); - rpcrdma_put_mw(r_xprt, mw); - return; + /* Ensure MW is not on any rl_registered list */ + if (!list_empty(&r->mw_list)) + list_del(&r->mw_list); + + kfree(r->fmr.fm_physaddrs); + kfree(r->mw_sg); + + /* In case this one was left mapped, try to unmap it + * to prevent dealloc_fmr from failing with EBUSY + */ + rc = __fmr_unmap(r); + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + r, rc); + + rc = ib_dealloc_fmr(r->fmr.fm_mr); + if (rc) + pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", + r, rc); + + kfree(r); } -/* A broken MR was discovered in a context that can't sleep. - * Defer recovery to the recovery worker. +/* Reset of a single FMR. */ static void -__fmr_queue_recovery(struct rpcrdma_mw *mw) +fmr_op_recover_mr(struct rpcrdma_mw *mw) { - INIT_WORK(&mw->mw_work, __fmr_recovery_worker); - queue_work(fmr_recovery_wq, &mw->mw_work); + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + int rc; + + /* ORDER: invalidate first */ + rc = __fmr_unmap(mw); + + /* ORDER: then DMA unmap */ + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (rc) + goto out_release; + + rpcrdma_put_mw(r_xprt, mw); + r_xprt->rx_stats.mrs_recovered++; + return; + +out_release: + pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw); + r_xprt->rx_stats.mrs_orphaned++; + + spin_lock(&r_xprt->rx_buf.rb_mwlock); + list_del(&mw->mw_all); + spin_unlock(&r_xprt->rx_buf.rb_mwlock); + + fmr_op_release_mr(mw); } static int @@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); } -static int -fmr_op_init(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - struct rpcrdma_mw *r; - int i, rc; - - spin_lock_init(&buf->rb_mwlock); - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - - i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1); - i += 2; /* head + tail */ - i *= buf->rb_max_requests; /* one set for each RPC slot */ - dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); - - rc = -ENOMEM; - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (!r) - goto out; - - r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * - sizeof(u64), GFP_KERNEL); - if (!r->fmr.physaddrs) - goto out_free; - - r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->fmr.fmr)) - goto out_fmr_err; - - r->mw_xprt = r_xprt; - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - return 0; - -out_fmr_err: - rc = PTR_ERR(r->fmr.fmr); - dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); - kfree(r->fmr.physaddrs); -out_free: - kfree(r); -out: - return rc; -} - /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ static int fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing) + int nsegs, bool writing, struct rpcrdma_mw **out) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct ib_device *device = ia->ri_device; - enum dma_data_direction direction = rpcrdma_data_dir(writing); struct rpcrdma_mr_seg *seg1 = seg; int len, pageoff, i, rc; struct rpcrdma_mw *mw; + u64 *dma_pages; - mw = seg1->rl_mw; - seg1->rl_mw = NULL; - if (!mw) { - mw = rpcrdma_get_mw(r_xprt); - if (!mw) - return -ENOMEM; - } else { - /* this is a retransmit; generate a fresh rkey */ - rc = __fmr_unmap(mw); - if (rc) - return rc; - } + mw = rpcrdma_get_mw(r_xprt); + if (!mw) + return -ENOBUFS; pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ @@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (nsegs > RPCRDMA_MAX_FMR_SGES) nsegs = RPCRDMA_MAX_FMR_SGES; for (i = 0; i < nsegs;) { - rpcrdma_map_one(device, seg, direction); - mw->fmr.physaddrs[i] = seg->mr_dma; + if (seg->mr_page) + sg_set_page(&mw->mw_sg[i], + seg->mr_page, + seg->mr_len, + offset_in_page(seg->mr_offset)); + else + sg_set_buf(&mw->mw_sg[i], seg->mr_offset, + seg->mr_len); len += seg->mr_len; ++seg; ++i; @@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - - rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, - i, seg1->mr_dma); + mw->mw_nents = i; + mw->mw_dir = rpcrdma_data_dir(writing); + if (i == 0) + goto out_dmamap_err; + + if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir)) + goto out_dmamap_err; + + for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) + dma_pages[i] = sg_dma_address(&mw->mw_sg[i]); + rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents, + dma_pages[0]); if (rc) goto out_maperr; - seg1->rl_mw = mw; - seg1->mr_rkey = mw->fmr.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - return i; + mw->mw_handle = mw->fmr.fm_mr->rkey; + mw->mw_length = len; + mw->mw_offset = dma_pages[0] + pageoff; -out_maperr: - dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", - __func__, len, (unsigned long long)seg1->mr_dma, - pageoff, i, rc); - while (i--) - rpcrdma_unmap_one(device, --seg); - return rc; -} + *out = mw; + return mw->mw_nents; -static void -__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) -{ - struct ib_device *device = r_xprt->rx_ia.ri_device; - int nsegs = seg->mr_nsegs; +out_dmamap_err: + pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", + mw->mw_sg, mw->mw_nents); + rpcrdma_defer_mr_recovery(mw); + return -EIO; - while (nsegs--) - rpcrdma_unmap_one(device, seg++); +out_maperr: + pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + len, (unsigned long long)dma_pages[0], + pageoff, mw->mw_nents, rc); + rpcrdma_defer_mr_recovery(mw); + return -EIO; } /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. + * + * Caller ensures that req->rl_registered is not empty. */ static void fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct rpcrdma_mr_seg *seg; - unsigned int i, nchunks; - struct rpcrdma_mw *mw; + struct rpcrdma_mw *mw, *tmp; LIST_HEAD(unmap_list); int rc; @@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* ORDER: Invalidate all of the req's MRs first * * ib_unmap_fmr() is slow, so use a single call instead - * of one call per mapped MR. + * of one call per mapped FMR. */ - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; - - list_add(&mw->fmr.fmr->list, &unmap_list); - - i += seg->mr_nsegs; - } + list_for_each_entry(mw, &req->rl_registered, mw_list) + list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); rc = ib_unmap_fmr(&unmap_list); if (rc) - pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); + goto out_reset; /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. */ - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + list_del_init(&mw->mw_list); + list_del_init(&mw->fmr.fm_mr->list); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + rpcrdma_put_mw(r_xprt, mw); + } - __fmr_dma_unmap(r_xprt, seg); - rpcrdma_put_mw(r_xprt, seg->rl_mw); + return; - i += seg->mr_nsegs; - seg->mr_nsegs = 0; - seg->rl_mw = NULL; - } +out_reset: + pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - req->rl_nchunks = 0; + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + list_del_init(&mw->fmr.fm_mr->list); + fmr_op_recover_mr(mw); + } } /* Use a slow, safe mechanism to invalidate all memory regions * that were registered for "req". - * - * In the asynchronous case, DMA unmapping occurs first here - * because the rpcrdma_mr_seg is released immediately after this - * call. It's contents won't be available in __fmr_dma_unmap later. - * FIXME. */ static void fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, bool sync) { - struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - unsigned int i; - - for (i = 0; req->rl_nchunks; req->rl_nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; - - if (sync) { - /* ORDER */ - __fmr_unmap(mw); - __fmr_dma_unmap(r_xprt, seg); - rpcrdma_put_mw(r_xprt, mw); - } else { - __fmr_dma_unmap(r_xprt, seg); - __fmr_queue_recovery(mw); - } - - i += seg->mr_nsegs; - seg->mr_nsegs = 0; - seg->rl_mw = NULL; - } -} - -static void -fmr_op_destroy(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - kfree(r->fmr.physaddrs); - rc = ib_dealloc_fmr(r->fmr.fmr); - if (rc) - dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", - __func__, rc); + while (!list_empty(&req->rl_registered)) { + mw = list_first_entry(&req->rl_registered, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); - kfree(r); + if (sync) + fmr_op_recover_mr(mw); + else + rpcrdma_defer_mr_recovery(mw); } } @@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap_safe = fmr_op_unmap_safe, + .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, - .ro_init = fmr_op_init, - .ro_destroy = fmr_op_destroy, + .ro_init_mr = fmr_op_init_mr, + .ro_release_mr = fmr_op_release_mr, .ro_displayname = "fmr", }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index f02ab80aa..892b5e1d9 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -73,29 +73,71 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static struct workqueue_struct *frwr_recovery_wq; - -#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) +bool +frwr_is_supported(struct rpcrdma_ia *ia) +{ + struct ib_device_attr *attrs = &ia->ri_device->attrs; + + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + goto out_not_supported; + if (attrs->max_fast_reg_page_list_len == 0) + goto out_not_supported; + return true; + +out_not_supported: + pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; +} -int -frwr_alloc_recovery_wq(void) +static int +frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) { - frwr_recovery_wq = alloc_workqueue("frwr_recovery", - FRWR_RECOVERY_WQ_FLAGS, 0); - return !frwr_recovery_wq ? -ENOMEM : 0; + unsigned int depth = ia->ri_max_frmr_depth; + struct rpcrdma_frmr *f = &r->frmr; + int rc; + + f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth); + if (IS_ERR(f->fr_mr)) + goto out_mr_err; + + r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); + if (!r->mw_sg) + goto out_list_err; + + sg_init_table(r->mw_sg, depth); + init_completion(&f->fr_linv_done); + return 0; + +out_mr_err: + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = -ENOMEM; + dprintk("RPC: %s: sg allocation failure\n", + __func__); + ib_dereg_mr(f->fr_mr); + return rc; } -void -frwr_destroy_recovery_wq(void) +static void +frwr_op_release_mr(struct rpcrdma_mw *r) { - struct workqueue_struct *wq; + int rc; - if (!frwr_recovery_wq) - return; + /* Ensure MW is not on any rl_registered list */ + if (!list_empty(&r->mw_list)) + list_del(&r->mw_list); - wq = frwr_recovery_wq; - frwr_recovery_wq = NULL; - destroy_workqueue(wq); + rc = ib_dereg_mr(r->frmr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + r, rc); + kfree(r->mw_sg); + kfree(r); } static int @@ -124,90 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) return 0; } -static void -__frwr_reset_and_unmap(struct rpcrdma_mw *mw) -{ - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc; - - rc = __frwr_reset_mr(ia, mw); - ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); - if (rc) - return; - rpcrdma_put_mw(r_xprt, mw); -} - -/* Deferred reset of a single FRMR. Generate a fresh rkey by - * replacing the MR. +/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. * * There's no recovery if this fails. The FRMR is abandoned, but * remains in rb_all. It will be cleaned up when the transport is * destroyed. */ static void -__frwr_recovery_worker(struct work_struct *work) +frwr_op_recover_mr(struct rpcrdma_mw *mw) { - struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, - mw_work); - - __frwr_reset_and_unmap(r); -} - -/* A broken MR was discovered in a context that can't sleep. - * Defer recovery to the recovery worker. - */ -static void -__frwr_queue_recovery(struct rpcrdma_mw *r) -{ - INIT_WORK(&r->mw_work, __frwr_recovery_worker); - queue_work(frwr_recovery_wq, &r->mw_work); -} - -static int -__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, unsigned int depth) -{ - struct rpcrdma_frmr *f = &r->frmr; + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; - f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); - if (IS_ERR(f->fr_mr)) - goto out_mr_err; - - r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); - if (!r->mw_sg) - goto out_list_err; - - sg_init_table(r->mw_sg, depth); - - init_completion(&f->fr_linv_done); - - return 0; + rc = __frwr_reset_mr(ia, mw); + ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (rc) + goto out_release; -out_mr_err: - rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_mr status %i\n", - __func__, rc); - return rc; + rpcrdma_put_mw(r_xprt, mw); + r_xprt->rx_stats.mrs_recovered++; + return; -out_list_err: - rc = -ENOMEM; - dprintk("RPC: %s: sg allocation failure\n", - __func__); - ib_dereg_mr(f->fr_mr); - return rc; -} +out_release: + pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); + r_xprt->rx_stats.mrs_orphaned++; -static void -__frwr_release(struct rpcrdma_mw *r) -{ - int rc; + spin_lock(&r_xprt->rx_buf.rb_mwlock); + list_del(&mw->mw_all); + spin_unlock(&r_xprt->rx_buf.rb_mwlock); - rc = ib_dereg_mr(r->frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr status %i\n", - __func__, rc); - kfree(r->mw_sg); + frwr_op_release_mr(mw); } static int @@ -343,54 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) complete_all(&frmr->fr_linv_done); } -static int -frwr_op_init(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - int i; - - spin_lock_init(&buf->rb_mwlock); - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - - i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1); - i += 2; /* head + tail */ - i *= buf->rb_max_requests; /* one set for each RPC slot */ - dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); - - while (i--) { - struct rpcrdma_mw *r; - int rc; - - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (!r) - return -ENOMEM; - - rc = __frwr_init(r, pd, depth); - if (rc) { - kfree(r); - return rc; - } - - r->mw_xprt = r_xprt; - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - - return 0; -} - /* Post a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ static int frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing) + int nsegs, bool writing, struct rpcrdma_mw **out) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg1 = seg; struct rpcrdma_mw *mw; struct rpcrdma_frmr *frmr; struct ib_mr *mr; @@ -399,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int rc, i, n, dma_nents; u8 key; - mw = seg1->rl_mw; - seg1->rl_mw = NULL; + mw = NULL; do { if (mw) - __frwr_queue_recovery(mw); + rpcrdma_defer_mr_recovery(mw); mw = rpcrdma_get_mw(r_xprt); if (!mw) - return -ENOMEM; + return -ENOBUFS; } while (mw->frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->frmr; frmr->fr_state = FRMR_IS_VALID; @@ -435,6 +383,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, } mw->mw_nents = i; mw->mw_dir = rpcrdma_data_dir(writing); + if (i == 0) + goto out_dmamap_err; dma_nents = ib_dma_map_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); @@ -468,36 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (rc) goto out_senderr; - seg1->rl_mw = mw; - seg1->mr_rkey = mr->rkey; - seg1->mr_base = mr->iova; - seg1->mr_nsegs = mw->mw_nents; - seg1->mr_len = mr->length; + mw->mw_handle = mr->rkey; + mw->mw_length = mr->length; + mw->mw_offset = mr->iova; + *out = mw; return mw->mw_nents; out_dmamap_err: pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", mw->mw_sg, mw->mw_nents); - return -ENOMEM; + rpcrdma_defer_mr_recovery(mw); + return -EIO; out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%u/%u)\n", frmr->fr_mr, n, mw->mw_nents); - rc = n < 0 ? n : -EIO; - __frwr_queue_recovery(mw); - return rc; + rpcrdma_defer_mr_recovery(mw); + return -EIO; out_senderr: - pr_err("rpcrdma: ib_post_send status %i\n", rc); - __frwr_queue_recovery(mw); - return rc; + pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); + rpcrdma_defer_mr_recovery(mw); + return -ENOTCONN; } static struct ib_send_wr * -__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) +__frwr_prepare_linv_wr(struct rpcrdma_mw *mw) { - struct rpcrdma_mw *mw = seg->rl_mw; struct rpcrdma_frmr *f = &mw->frmr; struct ib_send_wr *invalidate_wr; @@ -517,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. + * + * Caller ensures that req->rl_registered is not empty. */ static void frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg; - unsigned int i, nchunks; + struct rpcrdma_mw *mw, *tmp; struct rpcrdma_frmr *f; - struct rpcrdma_mw *mw; int rc; dprintk("RPC: %s: req %p\n", __func__, req); @@ -536,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ + f = NULL; invalidate_wrs = pos = prev = NULL; - seg = NULL; - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - - pos = __frwr_prepare_linv_wr(seg); + list_for_each_entry(mw, &req->rl_registered, mw_list) { + pos = __frwr_prepare_linv_wr(mw); if (!invalidate_wrs) invalidate_wrs = pos; else prev->next = pos; prev = pos; - - i += seg->mr_nsegs; + f = &mw->frmr; } - f = &seg->rl_mw->frmr; /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain @@ -576,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * them to the free MW list. */ unmap: - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; - seg->rl_mw = NULL; - + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + list_del_init(&mw->mw_list); ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); - - i += seg->mr_nsegs; - seg->mr_nsegs = 0; } - - req->rl_nchunks = 0; return; reset_mrs: - pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); + rdma_disconnect(ia->ri_id); /* Find and reset the MRs in the LOCAL_INV WRs that did not * get posted. This is synchronous, and slow. */ - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; + list_for_each_entry(mw, &req->rl_registered, mw_list) { f = &mw->frmr; - if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { __frwr_reset_mr(ia, mw); bad_wr = bad_wr->next; } - - i += seg->mr_nsegs; } goto unmap; } @@ -620,38 +552,17 @@ static void frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, bool sync) { - struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - unsigned int i; - for (i = 0; req->rl_nchunks; req->rl_nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; + while (!list_empty(&req->rl_registered)) { + mw = list_first_entry(&req->rl_registered, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); if (sync) - __frwr_reset_and_unmap(mw); + frwr_op_recover_mr(mw); else - __frwr_queue_recovery(mw); - - i += seg->mr_nsegs; - seg->mr_nsegs = 0; - seg->rl_mw = NULL; - } -} - -static void -frwr_op_destroy(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - - /* Ensure stale MWs for "buf" are no longer in flight */ - flush_workqueue(frwr_recovery_wq); - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - __frwr_release(r); - kfree(r); + rpcrdma_defer_mr_recovery(mw); } } @@ -659,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap_safe = frwr_op_unmap_safe, + .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, - .ro_init = frwr_op_init, - .ro_destroy = frwr_op_destroy, + .ro_init_mr = frwr_op_init_mr, + .ro_release_mr = frwr_op_release_mr, .ro_displayname = "frwr", }; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 35a81096e..a47f170b2 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf) * MR when they can. */ static int -rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, - int n, int nsegs) +rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) { size_t page_offset; u32 remaining; @@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, base = vec->iov_base; page_offset = offset_in_page(base); remaining = vec->iov_len; - while (remaining && n < nsegs) { + while (remaining && n < RPCRDMA_MAX_SEGS) { seg[n].mr_page = NULL; seg[n].mr_offset = base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); @@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, static int rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, - enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) { - int len, n = 0, p; - int page_base; + int len, n, p, page_base; struct page **ppages; + n = 0; if (pos == 0) { - n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); - if (n == nsegs) - return -EIO; + n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); + if (n == RPCRDMA_MAX_SEGS) + goto out_overflow; } len = xdrbuf->page_len; ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = xdrbuf->page_base & ~PAGE_MASK; p = 0; - while (len && n < nsegs) { + while (len && n < RPCRDMA_MAX_SEGS) { if (!ppages[p]) { /* alloc the pagelist for receiving buffer */ ppages[p] = alloc_page(GFP_ATOMIC); if (!ppages[p]) - return -ENOMEM; + return -EAGAIN; } seg[n].mr_page = ppages[p]; seg[n].mr_offset = (void *)(unsigned long) page_base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); if (seg[n].mr_len > PAGE_SIZE) - return -EIO; + goto out_overflow; len -= seg[n].mr_len; ++n; ++p; @@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, } /* Message overflows the seg array */ - if (len && n == nsegs) - return -EIO; + if (len && n == RPCRDMA_MAX_SEGS) + goto out_overflow; /* When encoding the read list, the tail is always sent inline */ if (type == rpcrdma_readch) @@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, * xdr pad bytes, saving the server an RDMA operation. */ if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) return n; - n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); - if (n == nsegs) - return -EIO; + n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); + if (n == RPCRDMA_MAX_SEGS) + goto out_overflow; } return n; + +out_overflow: + pr_err("rpcrdma: segment array overflow\n"); + return -EIO; } static inline __be32 * -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) { - *iptr++ = cpu_to_be32(seg->mr_rkey); - *iptr++ = cpu_to_be32(seg->mr_len); - return xdr_encode_hyper(iptr, seg->mr_base); + *iptr++ = cpu_to_be32(mw->mw_handle); + *iptr++ = cpu_to_be32(mw->mw_length); + return xdr_encode_hyper(iptr, mw->mw_offset); } /* XDR-encode the Read list. Supports encoding a list of read @@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpc_rqst *rqst, __be32 *iptr, enum rpcrdma_chunktype rtype) { - struct rpcrdma_mr_seg *seg = req->rl_nextseg; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; unsigned int pos; int n, nsegs; @@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) pos = 0; - nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, - RPCRDMA_MAX_SEGS - req->rl_nchunks); + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); - if (n <= 0) + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + false, &mw); + if (n < 0) return ERR_PTR(n); + list_add(&mw->mw_list, &req->rl_registered); *iptr++ = xdr_one; /* item present */ @@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, * have the same "position". */ *iptr++ = cpu_to_be32(pos); - iptr = xdr_encode_rdma_segment(iptr, seg); + iptr = xdr_encode_rdma_segment(iptr, mw); - dprintk("RPC: %5u %s: read segment pos %u " - "%d@0x%016llx:0x%08x (%s)\n", + dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, pos, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); + mw->mw_length, (unsigned long long)mw->mw_offset, + mw->mw_handle, n < nsegs ? "more" : "last"); r_xprt->rx_stats.read_chunk_count++; - req->rl_nchunks++; seg += n; nsegs -= n; } while (nsegs); - req->rl_nextseg = seg; /* Finish Read list */ *iptr++ = xdr_zero; /* Next item not present */ @@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpc_rqst *rqst, __be32 *iptr, enum rpcrdma_chunktype wtype) { - struct rpcrdma_mr_seg *seg = req->rl_nextseg; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; int n, nsegs, nchunks; __be32 *segcount; @@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return iptr; } + seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg, - RPCRDMA_MAX_SEGS - req->rl_nchunks); + wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); - if (n <= 0) + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (n < 0) return ERR_PTR(n); + list_add(&mw->mw_list, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, seg); + iptr = xdr_encode_rdma_segment(iptr, mw); - dprintk("RPC: %5u %s: write segment " - "%d@0x016%llx:0x%08x (%s)\n", + dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); + mw->mw_length, (unsigned long long)mw->mw_offset, + mw->mw_handle, n < nsegs ? "more" : "last"); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; - req->rl_nchunks++; nchunks++; seg += n; nsegs -= n; } while (nsegs); - req->rl_nextseg = seg; /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); @@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpc_rqst *rqst, __be32 *iptr, enum rpcrdma_chunktype wtype) { - struct rpcrdma_mr_seg *seg = req->rl_nextseg; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; int n, nsegs, nchunks; __be32 *segcount; @@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, return iptr; } - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, - RPCRDMA_MAX_SEGS - req->rl_nchunks); + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); - if (n <= 0) + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (n < 0) return ERR_PTR(n); + list_add(&mw->mw_list, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, seg); + iptr = xdr_encode_rdma_segment(iptr, mw); - dprintk("RPC: %5u %s: reply segment " - "%d@0x%016llx:0x%08x (%s)\n", + dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); + mw->mw_length, (unsigned long long)mw->mw_offset, + mw->mw_handle, n < nsegs ? "more" : "last"); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; - req->rl_nchunks++; nchunks++; seg += n; nsegs -= n; } while (nsegs); - req->rl_nextseg = seg; /* Update count of segments in the Reply chunk */ *segcount = cpu_to_be32(nchunks); @@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; + bool ddp_allowed; ssize_t hdrlen; size_t rpclen; __be32 *iptr; @@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); headerp->rm_type = rdma_msg; + /* When the ULP employs a GSS flavor that guarantees integrity + * or privacy, direct data placement of individual data items + * is not allowed. + */ + ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & + RPCAUTH_AUTH_DATATOUCH); + /* * Chunks needed for results? * @@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rpcrdma_results_inline(r_xprt, rqst)) wtype = rpcrdma_noch; - else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) wtype = rpcrdma_writech; else wtype = rpcrdma_replych; @@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) rtype = rpcrdma_noch; rpcrdma_inline_pullup(rqst); rpclen = rqst->rq_svec[0].iov_len; - } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; rpclen = rqst->rq_svec[0].iov_len; rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); @@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * send a Call message with a Position Zero Read chunk and a * regular Read chunk at the same time. */ - req->rl_nchunks = 0; - req->rl_nextseg = req->rl_segments; iptr = headerp->rm_body.rm_chunks; iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); if (IS_ERR(iptr)) @@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) out_overflow: pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); - /* Terminate this RPC. Chunks registered above will be - * released by xprt_release -> xprt_rmda_free . - */ - return -EIO; + iptr = ERR_PTR(-EIO); out_unmap: r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); @@ -705,15 +711,13 @@ out_unmap: * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) */ static int -rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) +rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) { unsigned int i, total_len; struct rpcrdma_write_chunk *cur_wchunk; char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); i = be32_to_cpu(**iptrp); - if (i > max) - return -1; cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); total_len = 0; while (i--) { @@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b return total_len; } -/* - * Scatter inline received data back into provided iov's. +/** + * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs + * @rqst: controlling RPC request + * @srcp: points to RPC message payload in receive buffer + * @copy_len: remaining length of receive buffer content + * @pad: Write chunk pad bytes needed (zero for pure inline) + * + * The upper layer has set the maximum number of bytes it can + * receive in each component of rq_rcv_buf. These values are set in + * the head.iov_len, page_len, tail.iov_len, and buflen fields. + * + * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in + * many cases this function simply updates iov_base pointers in + * rq_rcv_buf to point directly to the received reply data, to + * avoid copying reply data. + * + * Returns the count of bytes which had to be memcopied. */ -static void +static unsigned long rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { - int i, npages, curlen, olen; + unsigned long fixup_copy_count; + int i, npages, curlen; char *destp; struct page **ppages; int page_base; + /* The head iovec is redirected to the RPC reply message + * in the receive buffer, to avoid a memcopy. + */ + rqst->rq_rcv_buf.head[0].iov_base = srcp; + rqst->rq_private_buf.head[0].iov_base = srcp; + + /* The contents of the receive buffer that follow + * head.iov_len bytes are copied into the page list. + */ curlen = rqst->rq_rcv_buf.head[0].iov_len; - if (curlen > copy_len) { /* write chunk header fixup */ + if (curlen > copy_len) curlen = copy_len; - rqst->rq_rcv_buf.head[0].iov_len = curlen; - } - dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", __func__, srcp, copy_len, curlen); - - /* Shift pointer for first receive segment only */ - rqst->rq_rcv_buf.head[0].iov_base = srcp; srcp += curlen; copy_len -= curlen; - olen = copy_len; - i = 0; - rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; page_base = rqst->rq_rcv_buf.page_base; ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); page_base &= ~PAGE_MASK; - + fixup_copy_count = 0; if (copy_len && rqst->rq_rcv_buf.page_len) { - npages = PAGE_ALIGN(page_base + - rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; - for (; i < npages; i++) { + int pagelist_len; + + pagelist_len = rqst->rq_rcv_buf.page_len; + if (pagelist_len > copy_len) + pagelist_len = copy_len; + npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { curlen = PAGE_SIZE - page_base; - if (curlen > copy_len) - curlen = copy_len; + if (curlen > pagelist_len) + curlen = pagelist_len; + dprintk("RPC: %s: page %d" " srcp 0x%p len %d curlen %d\n", __func__, i, srcp, copy_len, curlen); @@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) kunmap_atomic(destp); srcp += curlen; copy_len -= curlen; - if (copy_len == 0) + fixup_copy_count += curlen; + pagelist_len -= curlen; + if (!pagelist_len) break; page_base = 0; } - } - if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { - curlen = copy_len; - if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) - curlen = rqst->rq_rcv_buf.tail[0].iov_len; - if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) - memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); - dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", - __func__, srcp, copy_len, curlen); - rqst->rq_rcv_buf.tail[0].iov_len = curlen; - copy_len -= curlen; ++i; - } else - rqst->rq_rcv_buf.tail[0].iov_len = 0; - - if (pad) { - /* implicit padding on terminal chunk */ - unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; - while (pad--) - p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + /* Implicit padding for the last segment in a Write + * chunk is inserted inline at the front of the tail + * iovec. The upper layer ignores the content of + * the pad. Simply ensure inline content in the tail + * that follows the Write chunk is properly aligned. + */ + if (pad) + srcp -= pad; } - if (copy_len) - dprintk("RPC: %s: %d bytes in" - " %d extra segments (%d lost)\n", - __func__, olen, i, copy_len); + /* The tail iovec is redirected to the remaining data + * in the receive buffer, to avoid a memcopy. + */ + if (copy_len || pad) { + rqst->rq_rcv_buf.tail[0].iov_base = srcp; + rqst->rq_private_buf.tail[0].iov_base = srcp; + } - /* TBD avoid a warning from call_decode() */ - rqst->rq_private_buf = rqst->rq_rcv_buf; + return fixup_copy_count; } void @@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) (headerp->rm_body.rm_chunks[1] == xdr_zero && headerp->rm_body.rm_chunks[2] != xdr_zero) || (headerp->rm_body.rm_chunks[1] != xdr_zero && - req->rl_nchunks == 0)) + list_empty(&req->rl_registered))) goto badheader; if (headerp->rm_body.rm_chunks[1] != xdr_zero) { /* count any expected write chunks in read reply */ /* start at write chunk array count */ iptr = &headerp->rm_body.rm_chunks[2]; - rdmalen = rpcrdma_count_chunks(rep, - req->rl_nchunks, 1, &iptr); + rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); /* check for validity, and no reply chunk after */ if (rdmalen < 0 || *iptr++ != xdr_zero) goto badheader; @@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) rep->rr_len -= RPCRDMA_HDRLEN_MIN; status = rep->rr_len; } - /* Fix up the rpc results for upper layer */ - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); + + r_xprt->rx_stats.fixup_copy_count += + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, + rdmalen); break; case rdma_nomsg: @@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (headerp->rm_body.rm_chunks[0] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero || headerp->rm_body.rm_chunks[2] != xdr_one || - req->rl_nchunks == 0) + list_empty(&req->rl_registered)) goto badheader; iptr = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); - rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); + rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); if (rdmalen < 0) goto badheader; r_xprt->rx_stats.total_rdma_reply += rdmalen; @@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) badheader: default: - dprintk("%s: invalid rpcrdma reply header (type %d):" - " chunks[012] == %d %d %d" - " expected chunks <= %d\n", - __func__, be32_to_cpu(headerp->rm_type), - headerp->rm_body.rm_chunks[0], - headerp->rm_body.rm_chunks[1], - headerp->rm_body.rm_chunks[2], - req->rl_nchunks); + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", + rqst->rq_task->tk_pid, __func__, + be32_to_cpu(headerp->rm_type)); status = -EIO; r_xprt->rx_stats.bad_reply_count++; break; @@ -1035,7 +1049,7 @@ out: * control: waking the next RPC waits until this RPC has * relinquished all its Send Queue entries. */ - if (req->rl_nchunks) + if (!list_empty(&req->rl_registered)) r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); spin_lock_bh(&xprt->transport_lock); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 99d2e5b72..81f0e879f 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -558,7 +558,6 @@ out_sendbuf: out_fail: rpcrdma_buffer_put(req); - r_xprt->rx_stats.failed_marshal_count++; return NULL; } @@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer) rpcrdma_buffer_put(req); } -/* +/** + * xprt_rdma_send_request - marshal and send an RPC request + * @task: RPC task with an RPC message in rq_snd_buf + * + * Return values: + * 0: The request has been sent + * ENOTCONN: Caller needs to invoke connect logic then call again + * ENOBUFS: Call again later to send the request + * EIO: A permanent error occurred. The request was not sent, + * and don't try it again + * * send_request invokes the meat of RPC RDMA. It must do the following: + * * 1. Marshal the RPC request into an RPC RDMA request, which means * putting a header in front of data, and creating IOVs for RDMA * from those in the request. @@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer) * the request (rpcrdma_ep_post). * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). */ - static int xprt_rdma_send_request(struct rpc_task *task) { @@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; + /* On retransmit, remove any previously registered chunks */ + r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); + rc = rpcrdma_marshal_req(rqst); if (rc < 0) goto failed_marshal; @@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task) return 0; failed_marshal: - r_xprt->rx_stats.failed_marshal_count++; dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", __func__, rc); if (rc == -EIO) - return -EIO; + r_xprt->rx_stats.failed_marshal_count++; + if (rc != -ENOTCONN) + return rc; drop_connection: xprt_disconnect_done(xprt); return -ENOTCONN; /* implies disconnect */ @@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) xprt->stat.bad_xids, xprt->stat.req_u, xprt->stat.bklog_u); - seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ", r_xprt->rx_stats.read_chunk_count, r_xprt->rx_stats.write_chunk_count, r_xprt->rx_stats.reply_chunk_count, @@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); + seq_printf(seq, "%lu %lu %lu\n", + r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_orphaned, + r_xprt->rx_stats.mrs_allocated); } static int @@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void) __func__, rc); rpcrdma_destroy_wq(); - frwr_destroy_recovery_wq(); rc = xprt_unregister_transport(&xprt_rdma_bc); if (rc) @@ -753,20 +769,13 @@ int xprt_rdma_init(void) { int rc; - rc = frwr_alloc_recovery_wq(); - if (rc) - return rc; - rc = rpcrdma_alloc_wq(); - if (rc) { - frwr_destroy_recovery_wq(); + if (rc) return rc; - } rc = xprt_register_transport(&xprt_rdma); if (rc) { rpcrdma_destroy_wq(); - frwr_destroy_recovery_wq(); return rc; } @@ -774,7 +783,6 @@ int xprt_rdma_init(void) if (rc) { xprt_unregister_transport(&xprt_rdma); rpcrdma_destroy_wq(); - frwr_destroy_recovery_wq(); return rc; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b044d98a1..799cce6cb 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include /* try_module_get()/module_put() */ @@ -379,8 +380,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) struct rpcrdma_ia *ia = &xprt->rx_ia; int rc; - ia->ri_dma_mr = NULL; - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); @@ -391,47 +390,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_pd = ib_alloc_pd(ia->ri_device); if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); - dprintk("RPC: %s: ib_alloc_pd() failed %i\n", - __func__, rc); + pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); goto out2; } - if (memreg == RPCRDMA_FRMR) { - if (!(ia->ri_device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS) || - (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { - dprintk("RPC: %s: FRMR registration " - "not supported by HCA\n", __func__); - memreg = RPCRDMA_MTHCAFMR; - } - } - if (memreg == RPCRDMA_MTHCAFMR) { - if (!ia->ri_device->alloc_fmr) { - dprintk("RPC: %s: MTHCAFMR registration " - "not supported by HCA\n", __func__); - rc = -EINVAL; - goto out3; - } - } - switch (memreg) { case RPCRDMA_FRMR: - ia->ri_ops = &rpcrdma_frwr_memreg_ops; - break; - case RPCRDMA_ALLPHYSICAL: - ia->ri_ops = &rpcrdma_physical_memreg_ops; - break; + if (frwr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_frwr_memreg_ops; + break; + } + /*FALLTHROUGH*/ case RPCRDMA_MTHCAFMR: - ia->ri_ops = &rpcrdma_fmr_memreg_ops; - break; + if (fmr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_fmr_memreg_ops; + break; + } + /*FALLTHROUGH*/ default: - printk(KERN_ERR "RPC: Unsupported memory " - "registration mode: %d\n", memreg); - rc = -ENOMEM; + pr_err("rpcrdma: Unsupported memory registration mode: %d\n", + memreg); + rc = -EINVAL; goto out3; } - dprintk("RPC: %s: memory registration strategy is '%s'\n", - __func__, ia->ri_ops->ro_displayname); return 0; @@ -585,8 +566,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, out2: ib_free_cq(sendcq); out1: - if (ia->ri_dma_mr) - ib_dereg_mr(ia->ri_dma_mr); return rc; } @@ -600,8 +579,6 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - int rc; - dprintk("RPC: %s: entering, connected is %d\n", __func__, ep->rep_connected); @@ -615,12 +592,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_free_cq(ep->rep_attr.recv_cq); ib_free_cq(ep->rep_attr.send_cq); - - if (ia->ri_dma_mr) { - rc = ib_dereg_mr(ia->ri_dma_mr); - dprintk("RPC: %s: ib_dereg_mr returned %i\n", - __func__, rc); - } } /* @@ -777,6 +748,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_drain_qp(ia->ri_id->qp); } +static void +rpcrdma_mr_recovery_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_recovery_worker.work); + struct rpcrdma_mw *mw; + + spin_lock(&buf->rb_recovery_lock); + while (!list_empty(&buf->rb_stale_mrs)) { + mw = list_first_entry(&buf->rb_stale_mrs, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); + spin_unlock(&buf->rb_recovery_lock); + + dprintk("RPC: %s: recovering MR %p\n", __func__, mw); + mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); + + spin_lock(&buf->rb_recovery_lock); + } + spin_unlock(&buf->rb_recovery_lock); +} + +void +rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) +{ + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + + spin_lock(&buf->rb_recovery_lock); + list_add(&mw->mw_list, &buf->rb_stale_mrs); + spin_unlock(&buf->rb_recovery_lock); + + schedule_delayed_work(&buf->rb_recovery_worker, 0); +} + +static void +rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int count; + LIST_HEAD(free); + LIST_HEAD(all); + + for (count = 0; count < 32; count++) { + struct rpcrdma_mw *mw; + int rc; + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + break; + + rc = ia->ri_ops->ro_init_mr(ia, mw); + if (rc) { + kfree(mw); + break; + } + + mw->mw_xprt = r_xprt; + + list_add(&mw->mw_list, &free); + list_add(&mw->mw_all, &all); + } + + spin_lock(&buf->rb_mwlock); + list_splice(&free, &buf->rb_mws); + list_splice(&all, &buf->rb_all); + r_xprt->rx_stats.mrs_allocated += count; + spin_unlock(&buf->rb_mwlock); + + dprintk("RPC: %s: created %u MRs\n", __func__, count); +} + +static void +rpcrdma_mr_refresh_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_refresh_worker.work); + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + + rpcrdma_create_mrs(r_xprt); +} + struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { @@ -793,6 +848,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) spin_unlock(&buffer->rb_reqslock); req->rl_cqe.done = rpcrdma_wc_send; req->rl_buffer = &r_xprt->rx_buf; + INIT_LIST_HEAD(&req->rl_registered); return req; } @@ -832,17 +888,23 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; int i, rc; buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_lock); atomic_set(&buf->rb_credits, 1); + spin_lock_init(&buf->rb_mwlock); + spin_lock_init(&buf->rb_lock); + spin_lock_init(&buf->rb_recovery_lock); + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + INIT_LIST_HEAD(&buf->rb_stale_mrs); + INIT_DELAYED_WORK(&buf->rb_refresh_worker, + rpcrdma_mr_refresh_worker); + INIT_DELAYED_WORK(&buf->rb_recovery_worker, + rpcrdma_mr_recovery_worker); - rc = ia->ri_ops->ro_init(r_xprt); - if (rc) - goto out; + rpcrdma_create_mrs(r_xprt); INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); @@ -862,7 +924,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + 2; i++) { + for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) { struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); @@ -918,17 +980,46 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) kfree(req); } +static void +rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mw *mw; + unsigned int count; + + count = 0; + spin_lock(&buf->rb_mwlock); + while (!list_empty(&buf->rb_all)) { + mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&mw->mw_all); + + spin_unlock(&buf->rb_mwlock); + ia->ri_ops->ro_release_mr(mw); + count++; + spin_lock(&buf->rb_mwlock); + } + spin_unlock(&buf->rb_mwlock); + r_xprt->rx_stats.mrs_allocated = 0; + + dprintk("RPC: %s: released %u MRs\n", __func__, count); +} + void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_ia *ia = rdmab_to_ia(buf); + cancel_delayed_work_sync(&buf->rb_recovery_worker); + while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; rep = rpcrdma_buffer_get_rep_locked(buf); rpcrdma_destroy_rep(ia, rep); } + buf->rb_send_count = 0; spin_lock(&buf->rb_reqslock); while (!list_empty(&buf->rb_allreqs)) { @@ -943,8 +1034,9 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) spin_lock(&buf->rb_reqslock); } spin_unlock(&buf->rb_reqslock); + buf->rb_recv_count = 0; - ia->ri_ops->ro_destroy(buf); + rpcrdma_destroy_mrs(buf); } struct rpcrdma_mw * @@ -962,8 +1054,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) spin_unlock(&buf->rb_mwlock); if (!mw) - pr_err("RPC: %s: no MWs available\n", __func__); + goto out_nomws; return mw; + +out_nomws: + dprintk("RPC: %s: no MWs available\n", __func__); + schedule_delayed_work(&buf->rb_refresh_worker, 0); + + /* Allow the reply handler and refresh worker to run */ + cond_resched(); + + return NULL; } void @@ -976,6 +1077,23 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) spin_unlock(&buf->rb_mwlock); } +static struct rpcrdma_rep * +rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers) +{ + /* If an RPC previously completed without a reply (say, a + * credential problem or a soft timeout occurs) then hold off + * on supplying more Receive buffers until the number of new + * pending RPCs catches up to the number of posted Receives. + */ + if (unlikely(buffers->rb_send_count < buffers->rb_recv_count)) + return NULL; + + if (unlikely(list_empty(&buffers->rb_recv_bufs))) + return NULL; + buffers->rb_recv_count++; + return rpcrdma_buffer_get_rep_locked(buffers); +} + /* * Get a set of request/reply buffers. * @@ -989,10 +1107,9 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_send_bufs)) goto out_reqbuf; + buffers->rb_send_count++; req = rpcrdma_buffer_get_req_locked(buffers); - if (list_empty(&buffers->rb_recv_bufs)) - goto out_repbuf; - req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); return req; @@ -1000,11 +1117,6 @@ out_reqbuf: spin_unlock(&buffers->rb_lock); pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; -out_repbuf: - spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of reply buffers\n", __func__); - req->rl_reply = NULL; - return req; } /* @@ -1021,9 +1133,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; spin_lock(&buffers->rb_lock); + buffers->rb_send_count--; list_add_tail(&req->rl_free, &buffers->rb_send_bufs); - if (rep) + if (rep) { + buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + } spin_unlock(&buffers->rb_lock); } @@ -1037,8 +1152,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; spin_lock(&buffers->rb_lock); - if (!list_empty(&buffers->rb_recv_bufs)) - req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); } @@ -1052,6 +1166,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; spin_lock(&buffers->rb_lock); + buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); spin_unlock(&buffers->rb_lock); } @@ -1060,14 +1175,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. */ -void -rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) -{ - dprintk("RPC: map_one: offset %p iova %llx len %zu\n", - seg->mr_offset, - (unsigned long long)seg->mr_dma, seg->mr_dmalen); -} - /** * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers * @ia: controlling rpcrdma_ia @@ -1150,7 +1257,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, if (rep) { rc = rpcrdma_ep_post_recv(ia, ep, rep); if (rc) - goto out; + return rc; req->rl_reply = NULL; } @@ -1175,10 +1282,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); if (rc) - dprintk("RPC: %s: ib_post_send returned %i\n", __func__, - rc); -out: - return rc; + goto out_postsend_err; + return 0; + +out_postsend_err: + pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc); + return -ENOTCONN; } /* @@ -1203,11 +1312,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, DMA_BIDIRECTIONAL); rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); - if (rc) - dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, - rc); - return rc; + goto out_postrecv; + return 0; + +out_postrecv: + pr_err("rpcrdma: ib_post_recv returned %i\n", rc); + return -ENOTCONN; } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c53abd128..a71b0f589 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -68,7 +68,6 @@ struct rpcrdma_ia { struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; - struct ib_mr *ri_dma_mr; struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; @@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) * o recv buffer (posted to provider) * o ib_sge (also donated to provider) * o status of reply (length, success or not) - * o bookkeeping state to get run by tasklet (list, etc) + * o bookkeeping state to get run by reply handler (list, etc) * - * These are allocated during initialization, per-transport instance; - * however, the tasklet execution list itself is global, as it should - * always be pretty short. + * These are allocated during initialization, per-transport instance. * * N of these are associated with a transport instance, and stored in * struct rpcrdma_buffer. N is the max number of outstanding requests. */ -#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) - -/* data segments + head/tail for Call + head/tail for Reply */ -#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4) - -struct rpcrdma_buffer; - struct rpcrdma_rep { struct ib_cqe rr_cqe; unsigned int rr_len; @@ -232,8 +222,8 @@ struct rpcrdma_frmr { }; struct rpcrdma_fmr { - struct ib_fmr *fmr; - u64 *physaddrs; + struct ib_fmr *fm_mr; + u64 *fm_physaddrs; }; struct rpcrdma_mw { @@ -245,8 +235,10 @@ struct rpcrdma_mw { struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; }; - struct work_struct mw_work; struct rpcrdma_xprt *mw_xprt; + u32 mw_handle; + u32 mw_length; + u64 mw_offset; struct list_head mw_all; }; @@ -266,33 +258,30 @@ struct rpcrdma_mw { * of iovs for send operations. The reason is that the iovs passed to * ib_post_{send,recv} must not be modified until the work request * completes. - * - * NOTES: - * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we - * marshal. The number needed varies depending on the iov lists that - * are passed to us, the memory registration mode we are in, and if - * physical addressing is used, the layout. */ +/* Maximum number of page-sized "segments" per chunk list to be + * registered or invalidated. Must handle a Reply chunk: + */ +enum { + RPCRDMA_MAX_IOV_SEGS = 3, + RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, + RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + + RPCRDMA_MAX_IOV_SEGS, +}; + struct rpcrdma_mr_seg { /* chunk descriptors */ - struct rpcrdma_mw *rl_mw; /* registered MR */ - u64 mr_base; /* registration result */ - u32 mr_rkey; /* registration result */ u32 mr_len; /* length of chunk or segment */ - int mr_nsegs; /* number of segments in chunk or 0 */ - enum dma_data_direction mr_dir; /* segment mapping direction */ - dma_addr_t mr_dma; /* segment mapping address */ - size_t mr_dmalen; /* segment mapping length */ struct page *mr_page; /* owning page, if any */ char *mr_offset; /* kva if no page, else offset */ }; #define RPCRDMA_MAX_IOVS (2) +struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_free; unsigned int rl_niovs; - unsigned int rl_nchunks; unsigned int rl_connect_cookie; struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; @@ -300,12 +289,13 @@ struct rpcrdma_req { struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; struct rpcrdma_regbuf *rl_rdmabuf; struct rpcrdma_regbuf *rl_sendbuf; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; - struct rpcrdma_mr_seg *rl_nextseg; struct ib_cqe rl_cqe; struct list_head rl_all; bool rl_backchannel; + + struct list_head rl_registered; /* registered segments */ + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * @@ -331,6 +321,7 @@ struct rpcrdma_buffer { char *rb_pool; spinlock_t rb_lock; /* protect buf lists */ + int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; u32 rb_max_requests; @@ -341,6 +332,11 @@ struct rpcrdma_buffer { struct list_head rb_allreqs; u32 rb_bc_max_requests; + + spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ + struct list_head rb_stale_mrs; + struct delayed_work rb_recovery_worker; + struct delayed_work rb_refresh_worker; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -387,6 +383,9 @@ struct rpcrdma_stats { unsigned long bad_reply_count; unsigned long nomsg_call_count; unsigned long bcall_count; + unsigned long mrs_recovered; + unsigned long mrs_orphaned; + unsigned long mrs_allocated; }; /* @@ -395,23 +394,25 @@ struct rpcrdma_stats { struct rpcrdma_xprt; struct rpcrdma_memreg_ops { int (*ro_map)(struct rpcrdma_xprt *, - struct rpcrdma_mr_seg *, int, bool); + struct rpcrdma_mr_seg *, int, bool, + struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct rpcrdma_req *); void (*ro_unmap_safe)(struct rpcrdma_xprt *, struct rpcrdma_req *, bool); + void (*ro_recover_mr)(struct rpcrdma_mw *); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); - int (*ro_init)(struct rpcrdma_xprt *); - void (*ro_destroy)(struct rpcrdma_buffer *); + int (*ro_init_mr)(struct rpcrdma_ia *, + struct rpcrdma_mw *); + void (*ro_release_mr)(struct rpcrdma_mw *); const char *ro_displayname; }; extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; -extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; /* * RPCRDMA transport -- encapsulates the structures above for @@ -446,6 +447,8 @@ extern int xprt_rdma_pad_optimize; */ int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); void rpcrdma_ia_close(struct rpcrdma_ia *); +bool frwr_is_supported(struct rpcrdma_ia *); +bool fmr_is_supported(struct rpcrdma_ia *); /* * Endpoint calls - xprtrdma/verbs.c @@ -477,6 +480,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); +void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); + struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, size_t, gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, @@ -484,9 +489,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *, int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); -int frwr_alloc_recovery_wq(void); -void frwr_destroy_recovery_wq(void); - int rpcrdma_alloc_wq(void); void rpcrdma_destroy_wq(void); @@ -494,45 +496,12 @@ void rpcrdma_destroy_wq(void); * Wrappers for chunk registration, shared by read/write chunk code. */ -void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); - static inline enum dma_data_direction rpcrdma_data_dir(bool writing) { return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; } -static inline void -rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, - enum dma_data_direction direction) -{ - seg->mr_dir = direction; - seg->mr_dmalen = seg->mr_len; - - if (seg->mr_page) - seg->mr_dma = ib_dma_map_page(device, - seg->mr_page, offset_in_page(seg->mr_offset), - seg->mr_dmalen, seg->mr_dir); - else - seg->mr_dma = ib_dma_map_single(device, - seg->mr_offset, - seg->mr_dmalen, seg->mr_dir); - - if (ib_dma_mapping_error(device, seg->mr_dma)) - rpcrdma_mapping_error(seg); -} - -static inline void -rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) -{ - if (seg->mr_page) - ib_dma_unmap_page(device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); - else - ib_dma_unmap_single(device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); -} - /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 167cf5931..bf168838a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, - .extra2 = &xprt_max_resvport_limit + .extra2 = &xprt_max_resvport }, { .procname = "max_resvport", @@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xprt_min_resvport_limit, + .extra1 = &xprt_min_resvport, .extra2 = &xprt_max_resvport_limit }, { @@ -177,7 +177,6 @@ static struct ctl_table sunrpc_table[] = { * increase over time if the server is down or not responding. */ #define XS_TCP_INIT_REEST_TO (3U * HZ) -#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ) /* * TCP idle timeout; client drops the transport socket if it is idle @@ -642,6 +641,7 @@ static int xs_tcp_send_request(struct rpc_task *task) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; bool zerocopy = true; + bool vm_wait = false; int status; int sent; @@ -677,15 +677,33 @@ static int xs_tcp_send_request(struct rpc_task *task) return 0; } + WARN_ON_ONCE(sent == 0 && status == 0); + + if (status == -EAGAIN ) { + /* + * Return EAGAIN if we're sure we're hitting the + * socket send buffer limits. + */ + if (test_bit(SOCK_NOSPACE, &transport->sock->flags)) + break; + /* + * Did we hit a memory allocation failure? + */ + if (sent == 0) { + status = -ENOBUFS; + if (vm_wait) + break; + /* Retry, knowing now that we're below the + * socket send buffer limit + */ + vm_wait = true; + } + continue; + } if (status < 0) break; - if (sent == 0) { - status = -EAGAIN; - break; - } + vm_wait = false; } - if (status == -EAGAIN && sk_stream_is_writeable(transport->inet)) - status = -ENOBUFS; switch (status) { case -ENOTSOCK: @@ -755,11 +773,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_error_report = transport->old_error_report; } +static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); +} + static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); + xs_sock_reset_state_flags(xprt); smp_mb__after_atomic(); } @@ -962,10 +988,13 @@ static void xs_local_data_receive(struct sock_xprt *transport) goto out; for (;;) { skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb == NULL) + if (skb != NULL) { + xs_local_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram(sk, skb); + continue; + } + if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; - xs_local_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); } out: mutex_unlock(&transport->recv_mutex); @@ -1043,10 +1072,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport) goto out; for (;;) { skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb == NULL) + if (skb != NULL) { + xs_udp_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram_locked(sk, skb); + continue; + } + if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; - xs_udp_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); } out: mutex_unlock(&transport->recv_mutex); @@ -1074,7 +1106,14 @@ static void xs_data_ready(struct sock *sk) if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - queue_work(rpciod_workqueue, &transport->recv_worker); + transport->old_data_ready(sk); + /* Any data means we had a useful conversation, so + * then we don't need to delay the next reconnect + */ + if (xprt->reestablish_timeout) + xprt->reestablish_timeout = 0; + if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + queue_work(xprtiod_workqueue, &transport->recv_worker); } read_unlock_bh(&sk->sk_callback_lock); } @@ -1474,10 +1513,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) for (;;) { lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - release_sock(sk); - if (read <= 0) - break; - total += read; + if (read <= 0) { + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); + release_sock(sk); + if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + break; + } else { + release_sock(sk); + total += read; + } rd_desc.count = 65536; } out: @@ -1492,34 +1536,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work) xs_tcp_data_receive(transport); } -/** - * xs_tcp_data_ready - "data ready" callback for TCP sockets - * @sk: socket with data to read - * - */ -static void xs_tcp_data_ready(struct sock *sk) -{ - struct sock_xprt *transport; - struct rpc_xprt *xprt; - - dprintk("RPC: xs_tcp_data_ready...\n"); - - read_lock_bh(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk))) - goto out; - transport = container_of(xprt, struct sock_xprt, xprt); - - /* Any data means we had a useful conversation, so - * the we don't need to delay the next reconnect - */ - if (xprt->reestablish_timeout) - xprt->reestablish_timeout = 0; - queue_work(rpciod_workqueue, &transport->recv_worker); - -out: - read_unlock_bh(&sk->sk_callback_lock); -} - /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1714,7 +1730,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) static unsigned short xs_get_random_port(void) { - unsigned short range = xprt_max_resvport - xprt_min_resvport; + unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; unsigned short rand = (unsigned short) prandom_u32() % range; return rand + xprt_min_resvport; } @@ -2156,6 +2172,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) write_unlock_bh(&sk->sk_callback_lock); } xs_udp_do_set_buffer_size(xprt); + + xprt->stat.connect_start = jiffies; } static void xs_udp_setup_socket(struct work_struct *work) @@ -2219,6 +2237,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) unsigned int keepcnt = xprt->timeout->to_retries + 1; unsigned int opt_on = 1; unsigned int timeo; + unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; /* TCP Keepalive options */ kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, @@ -2230,6 +2249,16 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); + /* Avoid temporary address, they are bad for long-lived + * connections such as NFS mounts. + * RFC4941, section 3.6 suggests that: + * Individual applications, which have specific + * knowledge about the normal duration of connections, + * MAY override this as appropriate. + */ + kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, + (char *)&addr_pref, sizeof(addr_pref)); + /* TCP user timeout (see RFC5482) */ timeo = jiffies_to_msecs(xprt->timeout->to_initval) * (xprt->timeout->to_retries + 1); @@ -2241,7 +2270,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; - sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_data_ready = xs_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sock_set_flag(sk, SOCK_FASYNC); @@ -2356,6 +2385,25 @@ out: xprt_wake_pending_tasks(xprt, status); } +static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt) +{ + unsigned long start, now = jiffies; + + start = xprt->stat.connect_start + xprt->reestablish_timeout; + if (time_after(start, now)) + return start - now; + return 0; +} + +static void xs_reconnect_backoff(struct rpc_xprt *xprt) +{ + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > xprt->max_reconnect_timeout) + xprt->reestablish_timeout = xprt->max_reconnect_timeout; + if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; +} + /** * xs_connect - connect a socket to a remote endpoint * @xprt: pointer to transport structure @@ -2373,6 +2421,7 @@ out: static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + unsigned long delay = 0; WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); @@ -2384,19 +2433,15 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) /* Start by resetting any existing state */ xs_reset_transport(transport); - queue_delayed_work(rpciod_workqueue, - &transport->connect_worker, - xprt->reestablish_timeout); - xprt->reestablish_timeout <<= 1; - if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) - xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; - if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO) - xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; - } else { + delay = xs_reconnect_delay(xprt); + xs_reconnect_backoff(xprt); + + } else dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - queue_delayed_work(rpciod_workqueue, - &transport->connect_worker, 0); - } + + queue_delayed_work(xprtiod_workqueue, + &transport->connect_worker, + delay); } /** @@ -2948,6 +2993,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->ops = &xs_tcp_ops; xprt->timeout = &xs_tcp_default_timeout; + xprt->max_reconnect_timeout = xprt->timeout->to_maxval; + INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); @@ -3157,8 +3204,12 @@ static int param_set_uint_minmax(const char *val, static int param_set_portnr(const char *val, const struct kernel_param *kp) { - return param_set_uint_minmax(val, kp, + if (kp->arg == &xprt_min_resvport) + return param_set_uint_minmax(val, kp, RPC_MIN_RESVPORT, + xprt_max_resvport); + return param_set_uint_minmax(val, kp, + xprt_min_resvport, RPC_MAX_RESVPORT); } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 59658b2e9..a5fc9dd24 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1286,8 +1286,8 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi) } EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); -static bool switchdev_port_same_parent_id(struct net_device *a, - struct net_device *b) +bool switchdev_port_same_parent_id(struct net_device *a, + struct net_device *b) { struct switchdev_attr a_attr = { .orig_dev = a, @@ -1323,6 +1323,7 @@ static u32 switchdev_port_fwd_mark_get(struct net_device *dev, return dev->ifindex; } +EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, u32 old_mark, u32 *reset_mark) diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 57e460be4..31b9f9c52 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_TIPC) := tipc.o tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ - name_distr.o subscr.o name_table.o net.o \ + name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ server.o socket.o diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 93f7c983b..bebb34780 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -43,9 +43,6 @@ #include #include "core.h" -#define TIPC_ZONE_MASK 0xff000000u -#define TIPC_CLUSTER_MASK 0xfffff000u - static inline u32 tipc_own_addr(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); @@ -60,7 +57,7 @@ static inline u32 tipc_zone_mask(u32 addr) static inline u32 tipc_cluster_mask(u32 addr) { - return addr & TIPC_CLUSTER_MASK; + return addr & TIPC_ZONE_CLUSTER_MASK; } u32 tipc_own_addr(struct net *net); @@ -73,4 +70,5 @@ int tipc_addr_node_valid(u32 addr); int tipc_in_scope(u32 domain, u32 addr); int tipc_addr_scope(u32 domain); char *tipc_addr_string_fill(char *string, u32 addr); + #endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index a597708ae..65b1bbf13 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1,7 +1,7 @@ /* * net/tipc/bearer.c: TIPC bearer code * - * Copyright (c) 1996-2006, 2013-2014, Ericsson AB + * Copyright (c) 1996-2006, 2013-2016, Ericsson AB * Copyright (c) 2004-2006, 2010-2013, Wind River Systems * All rights reserved. * @@ -39,6 +39,7 @@ #include "bearer.h" #include "link.h" #include "discover.h" +#include "monitor.h" #include "bcast.h" #include "netlink.h" @@ -170,6 +171,27 @@ struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name) return NULL; } +/* tipc_bearer_get_name - get the bearer name from its id. + * @net: network namespace + * @name: a pointer to the buffer where the name will be stored. + * @bearer_id: the id to get the name from. + */ +int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_bearer *b; + + if (bearer_id >= MAX_BEARERS) + return -EINVAL; + + b = rtnl_dereference(tn->bearer_list[bearer_id]); + if (!b) + return -EINVAL; + + strcpy(name, b->name); + return 0; +} + void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) { struct tipc_net *tn = net_generic(net, tipc_net_id); @@ -224,7 +246,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, if (tipc_addr_domain_valid(disc_domain) && (disc_domain != tn->own_addr)) { if (tipc_in_scope(disc_domain, tn->own_addr)) { - disc_domain = tn->own_addr & TIPC_CLUSTER_MASK; + disc_domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK; res = 0; /* accept any node in own cluster */ } else if (in_own_cluster_exact(net, disc_domain)) res = 0; /* accept specified node in own cluster */ @@ -313,6 +335,10 @@ restart: rcu_assign_pointer(tn->bearer_list[bearer_id], b); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); + + if (tipc_mon_create(net, bearer_id)) + return -ENOMEM; + pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, tipc_addr_string_fill(addr_string, disc_domain), priority); @@ -363,6 +389,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) tipc_disc_delete(b->link_req); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); kfree_rcu(b, rcu); + tipc_mon_delete(net, bearer_id); } int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, @@ -826,7 +853,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) u32 prio; prio = TIPC_MEDIA_LINK_PRI; - domain = tn->own_addr & TIPC_CLUSTER_MASK; + domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK; if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 60e49c3be..43757f1f9 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -1,7 +1,7 @@ /* * net/tipc/bearer.h: Include file for TIPC bearer code * - * Copyright (c) 1996-2006, 2013-2014, Ericsson AB + * Copyright (c) 1996-2006, 2013-2016, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -197,6 +197,7 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest); void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest); struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name); +int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id); struct tipc_media *tipc_media_find(const char *name); void tipc_bearer_reset_all(struct net *net); int tipc_bearer_setup(void); diff --git a/net/tipc/core.c b/net/tipc/core.c index fe1b062c4..236b043a4 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -57,6 +57,7 @@ static int __net_init tipc_init_net(struct net *net) tn->net_id = 4711; tn->own_addr = 0; + tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; get_random_bytes(&tn->random, sizeof(int)); INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); diff --git a/net/tipc/core.h b/net/tipc/core.h index eff58dc53..a1845fb27 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -66,11 +66,13 @@ struct tipc_bc_base; struct tipc_link; struct tipc_name_table; struct tipc_server; +struct tipc_monitor; #define TIPC_MOD_VER "2.0.0" -#define NODE_HTABLE_SIZE 512 -#define MAX_BEARERS 3 +#define NODE_HTABLE_SIZE 512 +#define MAX_BEARERS 3 +#define TIPC_DEF_MON_THRESHOLD 32 extern int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; @@ -88,6 +90,10 @@ struct tipc_net { u32 num_nodes; u32 num_links; + /* Neighbor monitoring list */ + struct tipc_monitor *monitors[MAX_BEARERS]; + int mon_threshold; + /* Bearer list */ struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1]; @@ -126,6 +132,11 @@ static inline struct list_head *tipc_nodes(struct net *net) return &tipc_net(net)->node_list; } +static inline unsigned int tipc_hashfn(u32 addr) +{ + return addr & (NODE_HTABLE_SIZE - 1); +} + static inline u16 mod(u16 x) { return x & 0xffffu; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index ad9d477cc..6b109a808 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -135,9 +135,12 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, u16 caps = msg_node_capabilities(hdr); bool respond = false; bool dupl_addr = false; + int err; - bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr)); + err = bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr)); kfree_skb(skb); + if (err) + return; /* Ensure message from node is valid and communication is permitted */ if (net_id != tn->net_id) diff --git a/net/tipc/link.c b/net/tipc/link.c index 7d89f8713..877d94f34 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -42,6 +42,7 @@ #include "name_distr.h" #include "discover.h" #include "netlink.h" +#include "monitor.h" #include @@ -87,7 +88,6 @@ struct tipc_stats { * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link * @tolerance: minimum link continuity loss needed to reset link [in ms] - * @keepalive_intv: link keepalive timer interval * @abort_limit: # of unacknowledged continuity probes needed to reset link * @state: current state of link FSM * @peer_caps: bitmap describing capabilities of peer node @@ -96,6 +96,7 @@ struct tipc_stats { * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') + * @mon_state: cookie with information needed by link monitor * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset @@ -131,7 +132,6 @@ struct tipc_link { u32 peer_bearer_id; u32 bearer_id; u32 tolerance; - unsigned long keepalive_intv; u32 abort_limit; u32 state; u16 peer_caps; @@ -140,6 +140,7 @@ struct tipc_link { char if_name[TIPC_MAX_IF_NAME]; u32 priority; char net_plane; + struct tipc_mon_state mon_state; u16 rst_cnt; /* Failover/synch */ @@ -713,18 +714,25 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) bool setup = false; u16 bc_snt = l->bc_sndlink->snd_nxt - 1; u16 bc_acked = l->bc_rcvlink->acked; - - link_profile_stats(l); + struct tipc_mon_state *mstate = &l->mon_state; switch (l->state) { case LINK_ESTABLISHED: case LINK_SYNCHING: - if (l->silent_intv_cnt > l->abort_limit) - return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); mtyp = STATE_MSG; + link_profile_stats(l); + tipc_mon_get_state(l->net, l->addr, mstate, l->bearer_id); + if (mstate->reset || (l->silent_intv_cnt > l->abort_limit)) + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); state = bc_acked != bc_snt; - probe = l->silent_intv_cnt; - l->silent_intv_cnt++; + state |= l->bc_rcvlink->rcv_unacked; + state |= l->rcv_unacked; + state |= !skb_queue_empty(&l->transmq); + state |= !skb_queue_empty(&l->deferdq); + probe = mstate->probing; + probe |= l->silent_intv_cnt; + if (probe || mstate->monitoring) + l->silent_intv_cnt++; break; case LINK_RESET: setup = l->rst_cnt++ <= 4; @@ -835,6 +843,7 @@ void tipc_link_reset(struct tipc_link *l) l->stats.recv_info = 0; l->stale_count = 0; l->bc_peer_is_up = false; + memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); } @@ -1243,6 +1252,9 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, struct tipc_msg *hdr; struct sk_buff_head *dfq = &l->deferdq; bool node_up = link_is_up(l->bc_rcvlink); + struct tipc_mon_state *mstate = &l->mon_state; + int dlen = 0; + void *data; /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) @@ -1255,12 +1267,13 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, - TIPC_MAX_IF_NAME, l->addr, + tipc_max_domain_size, l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!skb) return; hdr = buf_msg(skb); + data = msg_data(hdr); msg_set_session(hdr, l->session); msg_set_bearer_id(hdr, l->bearer_id); msg_set_net_plane(hdr, l->net_plane); @@ -1276,14 +1289,18 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (mtyp == STATE_MSG) { msg_set_seq_gap(hdr, rcvgap); - msg_set_size(hdr, INT_H_SIZE); msg_set_probe(hdr, probe); + tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); + msg_set_size(hdr, INT_H_SIZE + dlen); + skb_trim(skb, INT_H_SIZE + dlen); l->stats.sent_states++; l->rcv_unacked = 0; } else { /* RESET_MSG or ACTIVATE_MSG */ msg_set_max_pkt(hdr, l->advertised_mtu); - strcpy(msg_data(hdr), l->if_name); + strcpy(data, l->if_name); + msg_set_size(hdr, INT_H_SIZE + TIPC_MAX_IF_NAME); + skb_trim(skb, INT_H_SIZE + TIPC_MAX_IF_NAME); } if (probe) l->stats.sent_probes++; @@ -1376,7 +1393,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); u16 rcv_nxt = l->rcv_nxt; + u16 dlen = msg_data_sz(hdr); int mtyp = msg_type(hdr); + void *data; char *if_name; int rc = 0; @@ -1386,6 +1405,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (tipc_own_addr(l->net) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); + skb_linearize(skb); + hdr = buf_msg(skb); + data = msg_data(hdr); + switch (mtyp) { case RESET_MSG: @@ -1396,8 +1419,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* fall thru' */ case ACTIVATE_MSG: - skb_linearize(skb); - hdr = buf_msg(skb); /* Complete own link name with peer's interface name */ if_name = strrchr(l->name, ':') + 1; @@ -1405,7 +1426,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) break; - strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME); + strncpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) @@ -1453,6 +1474,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rc = TIPC_LINK_UP_EVT; break; } + tipc_mon_rcv(l->net, data, dlen, l->addr, + &l->mon_state, l->bearer_id); /* Send NACK if peer has sent pkts we haven't received yet */ if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c new file mode 100644 index 000000000..ed97a5876 --- /dev/null +++ b/net/tipc/monitor.c @@ -0,0 +1,804 @@ +/* + * net/tipc/monitor.c + * + * Copyright (c) 2016, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "core.h" +#include "addr.h" +#include "monitor.h" +#include "bearer.h" + +#define MAX_MON_DOMAIN 64 +#define MON_TIMEOUT 120000 +#define MAX_PEER_DOWN_EVENTS 4 + +/* struct tipc_mon_domain: domain record to be transferred between peers + * @len: actual size of domain record + * @gen: current generation of sender's domain + * @ack_gen: most recent generation of self's domain acked by peer + * @member_cnt: number of domain member nodes described in this record + * @up_map: bit map indicating which of the members the sender considers up + * @members: identity of the domain members + */ +struct tipc_mon_domain { + u16 len; + u16 gen; + u16 ack_gen; + u16 member_cnt; + u64 up_map; + u32 members[MAX_MON_DOMAIN]; +}; + +/* struct tipc_peer: state of a peer node and its domain + * @addr: tipc node identity of peer + * @head_map: shows which other nodes currently consider peer 'up' + * @domain: most recent domain record from peer + * @hash: position in hashed lookup list + * @list: position in linked list, in circular ascending order by 'addr' + * @applied: number of reported domain members applied on this monitor list + * @is_up: peer is up as seen from this node + * @is_head: peer is assigned domain head as seen from this node + * @is_local: peer is in local domain and should be continuously monitored + * @down_cnt: - numbers of other peers which have reported this on lost + */ +struct tipc_peer { + u32 addr; + struct tipc_mon_domain *domain; + struct hlist_node hash; + struct list_head list; + u8 applied; + u8 down_cnt; + bool is_up; + bool is_head; + bool is_local; +}; + +struct tipc_monitor { + struct hlist_head peers[NODE_HTABLE_SIZE]; + int peer_cnt; + struct tipc_peer *self; + rwlock_t lock; + struct tipc_mon_domain cache; + u16 list_gen; + u16 dom_gen; + struct net *net; + struct timer_list timer; + unsigned long timer_intv; +}; + +static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id) +{ + return tipc_net(net)->monitors[bearer_id]; +} + +const int tipc_max_domain_size = sizeof(struct tipc_mon_domain); + +/* dom_rec_len(): actual length of domain record for transport + */ +static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) +{ + return ((void *)&dom->members - (void *)dom) + (mcnt * sizeof(u32)); +} + +/* dom_size() : calculate size of own domain based on number of peers + */ +static int dom_size(int peers) +{ + int i = 0; + + while ((i * i) < peers) + i++; + return i < MAX_MON_DOMAIN ? i : MAX_MON_DOMAIN; +} + +static void map_set(u64 *up_map, int i, unsigned int v) +{ + *up_map &= ~(1ULL << i); + *up_map |= ((u64)v << i); +} + +static int map_get(u64 up_map, int i) +{ + return (up_map & (1 << i)) >> i; +} + +static struct tipc_peer *peer_prev(struct tipc_peer *peer) +{ + return list_last_entry(&peer->list, struct tipc_peer, list); +} + +static struct tipc_peer *peer_nxt(struct tipc_peer *peer) +{ + return list_first_entry(&peer->list, struct tipc_peer, list); +} + +static struct tipc_peer *peer_head(struct tipc_peer *peer) +{ + while (!peer->is_head) + peer = peer_prev(peer); + return peer; +} + +static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr) +{ + struct tipc_peer *peer; + unsigned int thash = tipc_hashfn(addr); + + hlist_for_each_entry(peer, &mon->peers[thash], hash) { + if (peer->addr == addr) + return peer; + } + return NULL; +} + +static struct tipc_peer *get_self(struct net *net, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + + return mon->self; +} + +static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon) +{ + struct tipc_net *tn = tipc_net(net); + + return mon->peer_cnt > tn->mon_threshold; +} + +/* mon_identify_lost_members() : - identify amd mark potentially lost members + */ +static void mon_identify_lost_members(struct tipc_peer *peer, + struct tipc_mon_domain *dom_bef, + int applied_bef) +{ + struct tipc_peer *member = peer; + struct tipc_mon_domain *dom_aft = peer->domain; + int applied_aft = peer->applied; + int i; + + for (i = 0; i < applied_bef; i++) { + member = peer_nxt(member); + + /* Do nothing if self or peer already see member as down */ + if (!member->is_up || !map_get(dom_bef->up_map, i)) + continue; + + /* Loss of local node must be detected by active probing */ + if (member->is_local) + continue; + + /* Start probing if member was removed from applied domain */ + if (!applied_aft || (applied_aft < i)) { + member->down_cnt = 1; + continue; + } + + /* Member loss is confirmed if it is still in applied domain */ + if (!map_get(dom_aft->up_map, i)) + member->down_cnt++; + } +} + +/* mon_apply_domain() : match a peer's domain record against monitor list + */ +static void mon_apply_domain(struct tipc_monitor *mon, + struct tipc_peer *peer) +{ + struct tipc_mon_domain *dom = peer->domain; + struct tipc_peer *member; + u32 addr; + int i; + + if (!dom || !peer->is_up) + return; + + /* Scan across domain members and match against monitor list */ + peer->applied = 0; + member = peer_nxt(peer); + for (i = 0; i < dom->member_cnt; i++) { + addr = dom->members[i]; + if (addr != member->addr) + return; + peer->applied++; + member = peer_nxt(member); + } +} + +/* mon_update_local_domain() : update after peer addition/removal/up/down + */ +static void mon_update_local_domain(struct tipc_monitor *mon) +{ + struct tipc_peer *self = mon->self; + struct tipc_mon_domain *cache = &mon->cache; + struct tipc_mon_domain *dom = self->domain; + struct tipc_peer *peer = self; + u64 prev_up_map = dom->up_map; + u16 member_cnt, i; + bool diff; + + /* Update local domain size based on current size of cluster */ + member_cnt = dom_size(mon->peer_cnt) - 1; + self->applied = member_cnt; + + /* Update native and cached outgoing local domain records */ + dom->len = dom_rec_len(dom, member_cnt); + diff = dom->member_cnt != member_cnt; + dom->member_cnt = member_cnt; + for (i = 0; i < member_cnt; i++) { + peer = peer_nxt(peer); + diff |= dom->members[i] != peer->addr; + dom->members[i] = peer->addr; + map_set(&dom->up_map, i, peer->is_up); + cache->members[i] = htonl(peer->addr); + } + diff |= dom->up_map != prev_up_map; + if (!diff) + return; + dom->gen = ++mon->dom_gen; + cache->len = htons(dom->len); + cache->gen = htons(dom->gen); + cache->member_cnt = htons(member_cnt); + cache->up_map = cpu_to_be64(dom->up_map); + mon_apply_domain(mon, self); +} + +/* mon_update_neighbors() : update preceding neighbors of added/removed peer + */ +static void mon_update_neighbors(struct tipc_monitor *mon, + struct tipc_peer *peer) +{ + int dz, i; + + dz = dom_size(mon->peer_cnt); + for (i = 0; i < dz; i++) { + mon_apply_domain(mon, peer); + peer = peer_prev(peer); + } +} + +/* mon_assign_roles() : reassign peer roles after a network change + * The monitor list is consistent at this stage; i.e., each peer is monitoring + * a set of domain members as matched between domain record and the monitor list + */ +static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head) +{ + struct tipc_peer *peer = peer_nxt(head); + struct tipc_peer *self = mon->self; + int i = 0; + + for (; peer != self; peer = peer_nxt(peer)) { + peer->is_local = false; + + /* Update domain member */ + if (i++ < head->applied) { + peer->is_head = false; + if (head == self) + peer->is_local = true; + continue; + } + /* Assign next domain head */ + if (!peer->is_up) + continue; + if (peer->is_head) + break; + head = peer; + head->is_head = true; + i = 0; + } + mon->list_gen++; +} + +void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *prev, *head; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer) + goto exit; + prev = peer_prev(peer); + list_del(&peer->list); + hlist_del(&peer->hash); + kfree(peer->domain); + kfree(peer); + mon->peer_cnt--; + head = peer_head(prev); + if (head == self) + mon_update_local_domain(mon); + mon_update_neighbors(mon, prev); + + /* Revert to full-mesh monitoring if we reach threshold */ + if (!tipc_mon_is_active(net, mon)) { + list_for_each_entry(peer, &self->list, list) { + kfree(peer->domain); + peer->domain = NULL; + peer->applied = 0; + } + } + mon_assign_roles(mon, head); +exit: + write_unlock_bh(&mon->lock); +} + +static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr, + struct tipc_peer **peer) +{ + struct tipc_peer *self = mon->self; + struct tipc_peer *cur, *prev, *p; + + p = kzalloc(sizeof(*p), GFP_ATOMIC); + *peer = p; + if (!p) + return false; + p->addr = addr; + + /* Add new peer to lookup list */ + INIT_LIST_HEAD(&p->list); + hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]); + + /* Sort new peer into iterator list, in ascending circular order */ + prev = self; + list_for_each_entry(cur, &self->list, list) { + if ((addr > prev->addr) && (addr < cur->addr)) + break; + if (((addr < cur->addr) || (addr > prev->addr)) && + (prev->addr > cur->addr)) + break; + prev = cur; + } + list_add_tail(&p->list, &cur->list); + mon->peer_cnt++; + mon_update_neighbors(mon, p); + return true; +} + +void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *head; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer && !tipc_mon_add_peer(mon, addr, &peer)) + goto exit; + peer->is_up = true; + head = peer_head(peer); + if (head == self) + mon_update_local_domain(mon); + mon_assign_roles(mon, head); +exit: + write_unlock_bh(&mon->lock); +} + +void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *head; + struct tipc_mon_domain *dom; + int applied; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer) { + pr_warn("Mon: unknown link %x/%u DOWN\n", addr, bearer_id); + goto exit; + } + applied = peer->applied; + peer->applied = 0; + dom = peer->domain; + peer->domain = NULL; + if (peer->is_head) + mon_identify_lost_members(peer, dom, applied); + kfree(dom); + peer->is_up = false; + peer->is_head = false; + peer->is_local = false; + peer->down_cnt = 0; + head = peer_head(peer); + if (head == self) + mon_update_local_domain(mon); + mon_assign_roles(mon, head); +exit: + write_unlock_bh(&mon->lock); +} + +/* tipc_mon_rcv - process monitor domain event message + */ +void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, + struct tipc_mon_state *state, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_mon_domain *arrv_dom = data; + struct tipc_mon_domain dom_bef; + struct tipc_mon_domain *dom; + struct tipc_peer *peer; + u16 new_member_cnt = ntohs(arrv_dom->member_cnt); + int new_dlen = dom_rec_len(arrv_dom, new_member_cnt); + u16 new_gen = ntohs(arrv_dom->gen); + u16 acked_gen = ntohs(arrv_dom->ack_gen); + bool probing = state->probing; + int i, applied_bef; + + state->probing = false; + if (!dlen) + return; + + /* Sanity check received domain record */ + if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) { + pr_warn_ratelimited("Received illegal domain record\n"); + return; + } + + /* Synch generation numbers with peer if link just came up */ + if (!state->synched) { + state->peer_gen = new_gen - 1; + state->acked_gen = acked_gen; + state->synched = true; + } + + if (more(acked_gen, state->acked_gen)) + state->acked_gen = acked_gen; + + /* Drop duplicate unless we are waiting for a probe response */ + if (!more(new_gen, state->peer_gen) && !probing) + return; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer || !peer->is_up) + goto exit; + + /* Peer is confirmed, stop any ongoing probing */ + peer->down_cnt = 0; + + /* Task is done for duplicate record */ + if (!more(new_gen, state->peer_gen)) + goto exit; + + state->peer_gen = new_gen; + + /* Cache current domain record for later use */ + dom_bef.member_cnt = 0; + dom = peer->domain; + if (dom) + memcpy(&dom_bef, dom, dom->len); + + /* Transform and store received domain record */ + if (!dom || (dom->len < new_dlen)) { + kfree(dom); + dom = kmalloc(new_dlen, GFP_ATOMIC); + peer->domain = dom; + if (!dom) + goto exit; + } + dom->len = new_dlen; + dom->gen = new_gen; + dom->member_cnt = new_member_cnt; + dom->up_map = be64_to_cpu(arrv_dom->up_map); + for (i = 0; i < new_member_cnt; i++) + dom->members[i] = ntohl(arrv_dom->members[i]); + + /* Update peers affected by this domain record */ + applied_bef = peer->applied; + mon_apply_domain(mon, peer); + mon_identify_lost_members(peer, &dom_bef, applied_bef); + mon_assign_roles(mon, peer_head(peer)); +exit: + write_unlock_bh(&mon->lock); +} + +void tipc_mon_prep(struct net *net, void *data, int *dlen, + struct tipc_mon_state *state, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_mon_domain *dom = data; + u16 gen = mon->dom_gen; + u16 len; + + if (!tipc_mon_is_active(net, mon)) + return; + + /* Send only a dummy record with ack if peer has acked our last sent */ + if (likely(state->acked_gen == gen)) { + len = dom_rec_len(dom, 0); + *dlen = len; + dom->len = htons(len); + dom->gen = htons(gen); + dom->ack_gen = htons(state->peer_gen); + dom->member_cnt = 0; + return; + } + /* Send the full record */ + read_lock_bh(&mon->lock); + len = ntohs(mon->cache.len); + *dlen = len; + memcpy(data, &mon->cache, len); + read_unlock_bh(&mon->lock); + dom->ack_gen = htons(state->peer_gen); +} + +void tipc_mon_get_state(struct net *net, u32 addr, + struct tipc_mon_state *state, + int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *peer; + + /* Used cached state if table has not changed */ + if (!state->probing && + (state->list_gen == mon->list_gen) && + (state->acked_gen == mon->dom_gen)) + return; + + read_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (peer) { + state->probing = state->acked_gen != mon->dom_gen; + state->probing |= peer->down_cnt; + state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS; + state->monitoring = peer->is_local; + state->monitoring |= peer->is_head; + state->list_gen = mon->list_gen; + } + read_unlock_bh(&mon->lock); +} + +static void mon_timeout(unsigned long m) +{ + struct tipc_monitor *mon = (void *)m; + struct tipc_peer *self; + int best_member_cnt = dom_size(mon->peer_cnt) - 1; + + write_lock_bh(&mon->lock); + self = mon->self; + if (self && (best_member_cnt != self->applied)) { + mon_update_local_domain(mon); + mon_assign_roles(mon, self); + } + write_unlock_bh(&mon->lock); + mod_timer(&mon->timer, jiffies + mon->timer_intv); +} + +int tipc_mon_create(struct net *net, int bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_monitor *mon; + struct tipc_peer *self; + struct tipc_mon_domain *dom; + + if (tn->monitors[bearer_id]) + return 0; + + mon = kzalloc(sizeof(*mon), GFP_ATOMIC); + self = kzalloc(sizeof(*self), GFP_ATOMIC); + dom = kzalloc(sizeof(*dom), GFP_ATOMIC); + if (!mon || !self || !dom) { + kfree(mon); + kfree(self); + kfree(dom); + return -ENOMEM; + } + tn->monitors[bearer_id] = mon; + rwlock_init(&mon->lock); + mon->net = net; + mon->peer_cnt = 1; + mon->self = self; + self->domain = dom; + self->addr = tipc_own_addr(net); + self->is_up = true; + self->is_head = true; + INIT_LIST_HEAD(&self->list); + setup_timer(&mon->timer, mon_timeout, (unsigned long)mon); + mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff)); + mod_timer(&mon->timer, jiffies + mon->timer_intv); + return 0; +} + +void tipc_mon_delete(struct net *net, int bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *tmp; + + write_lock_bh(&mon->lock); + tn->monitors[bearer_id] = NULL; + list_for_each_entry_safe(peer, tmp, &self->list, list) { + list_del(&peer->list); + hlist_del(&peer->hash); + kfree(peer->domain); + kfree(peer); + } + mon->self = NULL; + write_unlock_bh(&mon->lock); + del_timer_sync(&mon->timer); + kfree(self->domain); + kfree(self); + kfree(mon); +} + +int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) +{ + struct tipc_net *tn = tipc_net(net); + + if (cluster_size > TIPC_CLUSTER_SIZE) + return -EINVAL; + + tn->mon_threshold = cluster_size; + + return 0; +} + +int tipc_nl_monitor_get_threshold(struct net *net) +{ + struct tipc_net *tn = tipc_net(net); + + return tn->mon_threshold; +} + +int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg) +{ + struct tipc_mon_domain *dom = peer->domain; + struct nlattr *attrs; + void *hdr; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + NLM_F_MULTI, TIPC_NL_MON_PEER_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_MON_PEER); + if (!attrs) + goto msg_full; + + if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_ADDR, peer->addr)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_APPLIED, peer->applied)) + goto attr_msg_full; + + if (peer->is_up) + if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_UP)) + goto attr_msg_full; + if (peer->is_local) + if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_LOCAL)) + goto attr_msg_full; + if (peer->is_head) + if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_HEAD)) + goto attr_msg_full; + + if (dom) { + if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_DOMGEN, dom->gen)) + goto attr_msg_full; + if (nla_put_u64_64bit(msg->skb, TIPC_NLA_MON_PEER_UPMAP, + dom->up_map, TIPC_NLA_MON_PEER_PAD)) + goto attr_msg_full; + if (nla_put(msg->skb, TIPC_NLA_MON_PEER_MEMBERS, + dom->member_cnt * sizeof(u32), &dom->members)) + goto attr_msg_full; + } + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, + u32 bearer_id, u32 *prev_node) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *peer; + + if (!mon) + return -EINVAL; + + read_lock_bh(&mon->lock); + peer = mon->self; + do { + if (*prev_node) { + if (peer->addr == *prev_node) + *prev_node = 0; + else + continue; + } + if (__tipc_nl_add_monitor_peer(peer, msg)) { + *prev_node = peer->addr; + read_unlock_bh(&mon->lock); + return -EMSGSIZE; + } + } while ((peer = peer_nxt(peer)) != mon->self); + read_unlock_bh(&mon->lock); + + return 0; +} + +int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, + u32 bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + char bearer_name[TIPC_MAX_BEARER_NAME]; + struct nlattr *attrs; + void *hdr; + int ret; + + ret = tipc_bearer_get_name(net, bearer_name, bearer_id); + if (ret || !mon) + return -EINVAL; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + NLM_F_MULTI, TIPC_NL_MON_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_MON); + if (!attrs) + goto msg_full; + + read_lock_bh(&mon->lock); + if (nla_put_u32(msg->skb, TIPC_NLA_MON_REF, bearer_id)) + goto attr_msg_full; + if (tipc_mon_is_active(net, mon)) + if (nla_put_flag(msg->skb, TIPC_NLA_MON_ACTIVE)) + goto attr_msg_full; + if (nla_put_string(msg->skb, TIPC_NLA_MON_BEARER_NAME, bearer_name)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEERCNT, mon->peer_cnt)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_MON_LISTGEN, mon->list_gen)) + goto attr_msg_full; + + read_unlock_bh(&mon->lock); + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + read_unlock_bh(&mon->lock); + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h new file mode 100644 index 000000000..2a21b93e0 --- /dev/null +++ b/net/tipc/monitor.h @@ -0,0 +1,82 @@ +/* + * net/tipc/monitor.h + * + * Copyright (c) 2015, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_MONITOR_H +#define _TIPC_MONITOR_H + +#include "netlink.h" + +/* struct tipc_mon_state: link instance's cache of monitor list and domain state + * @list_gen: current generation of this node's monitor list + * @gen: current generation of this node's local domain + * @peer_gen: most recent domain generation received from peer + * @acked_gen: most recent generation of self's domain acked by peer + * @monitoring: this peer endpoint should continuously monitored + * @probing: peer endpoint should be temporarily probed for potential loss + * @synched: domain record's generation has been synched with peer after reset + */ +struct tipc_mon_state { + u16 list_gen; + u16 peer_gen; + u16 acked_gen; + bool monitoring :1; + bool probing :1; + bool reset :1; + bool synched :1; +}; + +int tipc_mon_create(struct net *net, int bearer_id); +void tipc_mon_delete(struct net *net, int bearer_id); + +void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id); +void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id); +void tipc_mon_prep(struct net *net, void *data, int *dlen, + struct tipc_mon_state *state, int bearer_id); +void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, + struct tipc_mon_state *state, int bearer_id); +void tipc_mon_get_state(struct net *net, u32 addr, + struct tipc_mon_state *state, + int bearer_id); +void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id); + +int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size); +int tipc_nl_monitor_get_threshold(struct net *net); +int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, + u32 bearer_id); +int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, + u32 bearer_id, u32 *prev_node); + +extern const int tipc_max_domain_size; +#endif diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 6b626a64b..a04fe9be1 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -62,6 +62,8 @@ static void publ_to_item(struct distr_item *i, struct publication *p) /** * named_prepare_buf - allocate & initialize a publication message + * + * The buffer returned is of size INT_H_SIZE + payload size */ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) @@ -141,9 +143,9 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - uint msg_dsz = (tipc_node_get_mtu(net, dnode, 0) / ITEM_SIZE) * - ITEM_SIZE; - uint msg_rem = msg_dsz; + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + ITEM_SIZE) * ITEM_SIZE; + u32 msg_rem = msg_dsz; list_for_each_entry(publ, pls, local_list) { /* Prepare next buffer: */ diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 56935df21..a84daec0a 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -52,7 +52,8 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_MEDIA] = { .type = NLA_NESTED, }, [TIPC_NLA_NODE] = { .type = NLA_NESTED, }, [TIPC_NLA_NET] = { .type = NLA_NESTED, }, - [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, } + [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, }, + [TIPC_NLA_MON] = { .type = NLA_NESTED, }, }; const struct nla_policy @@ -61,6 +62,12 @@ tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED } }; +const struct nla_policy tipc_nl_monitor_policy[TIPC_NLA_MON_MAX + 1] = { + [TIPC_NLA_MON_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_MON_REF] = { .type = NLA_U32 }, + [TIPC_NLA_MON_ACTIVATION_THRESHOLD] = { .type = NLA_U32 }, +}; + const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 }, @@ -214,7 +221,23 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_NAME_TABLE_GET, .dumpit = tipc_nl_name_table_dump, .policy = tipc_nl_policy, - } + }, + { + .cmd = TIPC_NL_MON_SET, + .doit = tipc_nl_node_set_monitor, + .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_MON_GET, + .doit = tipc_nl_node_get_monitor, + .dumpit = tipc_nl_node_dump_monitor, + .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_MON_PEER_GET, + .dumpit = tipc_nl_node_dump_monitor_peer, + .policy = tipc_nl_policy, + }, }; int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index ed1dbcb4a..4ba0ad422 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -55,6 +55,7 @@ extern const struct nla_policy tipc_nl_prop_policy[]; extern const struct nla_policy tipc_nl_bearer_policy[]; extern const struct nla_policy tipc_nl_media_policy[]; extern const struct nla_policy tipc_nl_udp_policy[]; +extern const struct nla_policy tipc_nl_monitor_policy[]; int tipc_netlink_start(void); int tipc_netlink_compat_start(void); diff --git a/net/tipc/node.c b/net/tipc/node.c index 23d476184..21974191e 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -40,6 +40,7 @@ #include "name_distr.h" #include "socket.h" #include "bcast.h" +#include "monitor.h" #include "discover.h" #include "netlink.h" @@ -205,17 +206,6 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr) return caps; } -/* - * A trivial power-of-two bitmask technique is used for speed, since this - * operation is done for every incoming TIPC packet. The number of hash table - * entries has been chosen so that no hash chain exceeds 8 nodes and will - * usually be much smaller (typically only a single node). - */ -static unsigned int tipc_hashfn(u32 addr) -{ - return addr & (NODE_HTABLE_SIZE - 1); -} - static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); @@ -279,6 +269,7 @@ static void tipc_node_write_unlock(struct tipc_node *n) u32 addr = 0; u32 flags = n->action_flags; u32 link_id = 0; + u32 bearer_id; struct list_head *publ_list; if (likely(!flags)) { @@ -288,6 +279,7 @@ static void tipc_node_write_unlock(struct tipc_node *n) addr = n->addr; link_id = n->link_id; + bearer_id = link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | @@ -301,13 +293,16 @@ static void tipc_node_write_unlock(struct tipc_node *n) if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, addr); - if (flags & TIPC_NOTIFY_LINK_UP) + if (flags & TIPC_NOTIFY_LINK_UP) { + tipc_mon_peer_up(net, addr, bearer_id); tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr, TIPC_NODE_SCOPE, link_id, addr); - - if (flags & TIPC_NOTIFY_LINK_DOWN) + } + if (flags & TIPC_NOTIFY_LINK_DOWN) { + tipc_mon_peer_down(net, addr, bearer_id); tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, link_id, addr); + } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) @@ -378,14 +373,13 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; - unsigned long keepalive_intv = msecs_to_jiffies(intv); /* Link with lowest tolerance determines timer interval */ - if (keepalive_intv < n->keepalive_intv) - n->keepalive_intv = keepalive_intv; + if (intv < n->keepalive_intv) + n->keepalive_intv = intv; - /* Ensure link's abort limit corresponds to current interval */ - tipc_link_set_abort_limit(l, tol / jiffies_to_msecs(n->keepalive_intv)); + /* Ensure link's abort limit corresponds to current tolerance */ + tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete(struct tipc_node *node) @@ -526,7 +520,7 @@ static void tipc_node_timeout(unsigned long data) if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } - mod_timer(&n->timer, jiffies + n->keepalive_intv); + mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** @@ -692,6 +686,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) struct tipc_link *l = le->link; struct tipc_media_addr *maddr; struct sk_buff_head xmitq; + int old_bearer_id = bearer_id; if (!l) return; @@ -711,6 +706,8 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) tipc_link_fsm_evt(l, LINK_RESET_EVT); } tipc_node_write_unlock(n); + if (delete) + tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); tipc_sk_rcv(n->net, &le->inputq); } @@ -735,6 +732,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, bool accept_addr = false; bool reset = true; char *if_name; + unsigned long intv; *dupl_addr = false; *respond = false; @@ -840,9 +838,11 @@ void tipc_node_check_dest(struct net *net, u32 onode, le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); - if (n->link_cnt == 1) - if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) + if (n->link_cnt == 1) { + intv = jiffies + msecs_to_jiffies(n->keepalive_intv); + if (!mod_timer(&n->timer, intv)) tipc_node_get(n); + } } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: @@ -950,7 +950,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) state = SELF_UP_PEER_UP; break; case SELF_LOST_CONTACT_EVT: - state = SELF_DOWN_PEER_LEAVING; + state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: @@ -969,7 +969,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) state = SELF_UP_PEER_UP; break; case PEER_LOST_CONTACT_EVT: - state = SELF_LEAVING_PEER_DOWN; + state = SELF_DOWN_PEER_DOWN; break; case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: @@ -1928,3 +1928,168 @@ out: return skb->len; } + +int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[TIPC_NLA_MON_MAX + 1]; + struct net *net = sock_net(skb->sk); + int err; + + if (!info->attrs[TIPC_NLA_MON]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_MON_MAX, + info->attrs[TIPC_NLA_MON], + tipc_nl_monitor_policy); + if (err) + return err; + + if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) { + u32 val; + + val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]); + err = tipc_nl_monitor_set_threshold(net, val); + if (err) + return err; + } + + return 0; +} + +static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) +{ + struct nlattr *attrs; + void *hdr; + u32 val; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + 0, TIPC_NL_MON_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_MON); + if (!attrs) + goto msg_full; + + val = tipc_nl_monitor_get_threshold(net); + + if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_nl_msg msg; + int err; + + msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + err = __tipc_nl_add_monitor_prop(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + + return genlmsg_reply(msg.skb, info); +} + +int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + u32 prev_bearer = cb->args[0]; + struct tipc_nl_msg msg; + int err; + int i; + + if (prev_bearer == MAX_BEARERS) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + rtnl_lock(); + for (i = prev_bearer; i < MAX_BEARERS; i++) { + prev_bearer = i; + err = __tipc_nl_add_monitor(net, &msg, prev_bearer); + if (err) + goto out; + } + +out: + rtnl_unlock(); + cb->args[0] = prev_bearer; + + return skb->len; +} + +int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + u32 prev_node = cb->args[1]; + u32 bearer_id = cb->args[2]; + int done = cb->args[0]; + struct tipc_nl_msg msg; + int err; + + if (!prev_node) { + struct nlattr **attrs; + struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; + + err = tipc_nlmsg_parse(cb->nlh, &attrs); + if (err) + return err; + + if (!attrs[TIPC_NLA_MON]) + return -EINVAL; + + err = nla_parse_nested(mon, TIPC_NLA_MON_MAX, + attrs[TIPC_NLA_MON], + tipc_nl_monitor_policy); + if (err) + return err; + + if (!mon[TIPC_NLA_MON_REF]) + return -EINVAL; + + bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]); + + if (bearer_id >= MAX_BEARERS) + return -EINVAL; + } + + if (done) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + rtnl_lock(); + err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node); + if (!err) + done = 1; + + rtnl_unlock(); + cb->args[0] = done; + cb->args[1] = prev_node; + cb->args[2] = bearer_id; + + return skb->len; +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 8264b3d97..d69fdfcc0 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -78,4 +78,9 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb); +int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, + struct netlink_callback *cb); #endif diff --git a/net/tipc/server.c b/net/tipc/server.c index 272d20a79..215849ce4 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -418,13 +418,12 @@ static struct outqueue_entry *tipc_alloc_entry(void *data, int len) if (!entry) return NULL; - buf = kmalloc(len, GFP_ATOMIC); + buf = kmemdup(data, len, GFP_ATOMIC); if (!buf) { kfree(entry); return NULL; } - memcpy(buf, data, len); entry->iov.iov_base = buf; entry->iov.iov_len = len; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index c9cf2be36..ae7e14cae 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -63,7 +63,7 @@ */ struct udp_media_addr { __be16 proto; - __be16 udp_port; + __be16 port; union { struct in_addr ipv4; struct in6_addr ipv6; @@ -108,9 +108,9 @@ static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size) struct udp_media_addr *ua = (struct udp_media_addr *)&a->value; if (ntohs(ua->proto) == ETH_P_IP) - snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->udp_port)); + snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->port)); else if (ntohs(ua->proto) == ETH_P_IPV6) - snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->udp_port)); + snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->port)); else pr_err("Invalid UDP media address\n"); return 0; @@ -178,8 +178,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, skb->dev = rt->dst.dev; ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, - dst->ipv4.s_addr, 0, ttl, 0, src->udp_port, - dst->udp_port, false, true); + dst->ipv4.s_addr, 0, ttl, 0, src->port, + dst->port, false, true); #if IS_ENABLED(CONFIG_IPV6) } else { struct dst_entry *ndst; @@ -196,8 +196,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, ndst->dev, &src->ipv6, - &dst->ipv6, 0, ttl, 0, src->udp_port, - dst->udp_port, false); + &dst->ipv6, 0, ttl, 0, src->port, + dst->port, false); #endif } return err; @@ -292,12 +292,12 @@ err: ip4 = (struct sockaddr_in *)&sa_local; local->proto = htons(ETH_P_IP); - local->udp_port = ip4->sin_port; + local->port = ip4->sin_port; local->ipv4.s_addr = ip4->sin_addr.s_addr; ip4 = (struct sockaddr_in *)&sa_remote; remote->proto = htons(ETH_P_IP); - remote->udp_port = ip4->sin_port; + remote->port = ip4->sin_port; remote->ipv4.s_addr = ip4->sin_addr.s_addr; return 0; @@ -312,13 +312,13 @@ err: return -EINVAL; local->proto = htons(ETH_P_IPV6); - local->udp_port = ip6->sin6_port; + local->port = ip6->sin6_port; memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); ub->ifindex = ip6->sin6_scope_id; ip6 = (struct sockaddr_in6 *)&sa_remote; remote->proto = htons(ETH_P_IPV6); - remote->udp_port = ip6->sin6_port; + remote->port = ip6->sin6_port; memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); return 0; #endif @@ -386,7 +386,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, err = -EAFNOSUPPORT; goto err; } - udp_conf.local_udp_port = local.udp_port; + udp_conf.local_udp_port = local.port; err = udp_sock_create(net, &udp_conf, &ub->ubsock); if (err) goto err; @@ -396,10 +396,13 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); - if (enable_mcast(ub, remote)) + err = enable_mcast(ub, remote); + if (err) goto err; return 0; err: + if (ub->ubsock) + udp_tunnel_sock_release(ub->ubsock); kfree(ub); return err; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e444fa47e..8309687a5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -769,6 +769,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) lockdep_set_class(&sk->sk_receive_queue.lock, &af_unix_sk_receive_queue_lock_key); + sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 14810abed..8831e7c42 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -26,3 +26,23 @@ config VMWARE_VMCI_VSOCKETS To compile this driver as a module, choose M here: the module will be called vmw_vsock_vmci_transport. If unsure, say N. + +config VIRTIO_VSOCKETS + tristate "virtio transport for Virtual Sockets" + depends on VSOCKETS && VIRTIO + select VIRTIO_VSOCKETS_COMMON + help + This module implements a virtio transport for Virtual Sockets. + + Enable this transport if your Virtual Machine host supports Virtual + Sockets over virtio. + + To compile this driver as a module, choose M here: the module will be + called vmw_vsock_virtio_transport. If unsure, say N. + +config VIRTIO_VSOCKETS_COMMON + tristate + help + This option is selected by any driver which needs to access + the virtio_vsock. The module will be called + vmw_vsock_virtio_transport_common. diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index 2ce52d70f..bc27c70e0 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -1,7 +1,13 @@ obj-$(CONFIG_VSOCKETS) += vsock.o obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o +obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o +obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o vsock-y += af_vsock.o vsock_addr.o vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ vmci_transport_notify_qstate.o + +vmw_vsock_virtio_transport-y += virtio_transport.o + +vmw_vsock_virtio_transport_common-y += virtio_transport_common.o diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index b96ac918e..8a398b3fb 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -344,6 +344,16 @@ static bool vsock_in_connected_table(struct vsock_sock *vsk) return ret; } +void vsock_remove_sock(struct vsock_sock *vsk) +{ + if (vsock_in_bound_table(vsk)) + vsock_remove_bound(vsk); + + if (vsock_in_connected_table(vsk)) + vsock_remove_connected(vsk); +} +EXPORT_SYMBOL_GPL(vsock_remove_sock); + void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) { int i; @@ -455,6 +465,8 @@ void vsock_pending_work(struct work_struct *work) if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); + + listener->sk_ack_backlog--; } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We @@ -465,8 +477,6 @@ void vsock_pending_work(struct work_struct *work) goto out; } - listener->sk_ack_backlog--; - /* We need to remove ourself from the global connected sockets list so * incoming packets can't find this socket, and to reduce the reference * count. @@ -660,12 +670,6 @@ static void __vsock_release(struct sock *sk) vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ - if (vsock_in_bound_table(vsk)) - vsock_remove_bound(vsk); - - if (vsock_in_connected_table(vsk)) - vsock_remove_connected(vsk); - transport->release(vsk); lock_sock(sk); @@ -1995,7 +1999,16 @@ void vsock_core_exit(void) } EXPORT_SYMBOL_GPL(vsock_core_exit); +const struct vsock_transport *vsock_core_get_transport(void) +{ + /* vsock_register_mutex not taken since only the transport uses this + * function and only while registered. + */ + return transport; +} +EXPORT_SYMBOL_GPL(vsock_core_get_transport); + MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); -MODULE_VERSION("1.0.1.0-k"); +MODULE_VERSION("1.0.2.0-k"); MODULE_LICENSE("GPL v2"); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c new file mode 100644 index 000000000..936d7eee6 --- /dev/null +++ b/net/vmw_vsock/virtio_transport.c @@ -0,0 +1,620 @@ +/* + * virtio transport for vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He + * Stefan Hajnoczi + * + * Some of the code is take from Gerd Hoffmann 's + * early virtio-vsock proof-of-concept bits. + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct workqueue_struct *virtio_vsock_workqueue; +static struct virtio_vsock *the_virtio_vsock; +static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */ + +struct virtio_vsock { + struct virtio_device *vdev; + struct virtqueue *vqs[VSOCK_VQ_MAX]; + + /* Virtqueue processing is deferred to a workqueue */ + struct work_struct tx_work; + struct work_struct rx_work; + struct work_struct event_work; + + /* The following fields are protected by tx_lock. vqs[VSOCK_VQ_TX] + * must be accessed with tx_lock held. + */ + struct mutex tx_lock; + + struct work_struct send_pkt_work; + spinlock_t send_pkt_list_lock; + struct list_head send_pkt_list; + + atomic_t queued_replies; + + /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX] + * must be accessed with rx_lock held. + */ + struct mutex rx_lock; + int rx_buf_nr; + int rx_buf_max_nr; + + /* The following fields are protected by event_lock. + * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. + */ + struct mutex event_lock; + struct virtio_vsock_event event_list[8]; + + u32 guest_cid; +}; + +static struct virtio_vsock *virtio_vsock_get(void) +{ + return the_virtio_vsock; +} + +static u32 virtio_transport_get_local_cid(void) +{ + struct virtio_vsock *vsock = virtio_vsock_get(); + + return vsock->guest_cid; +} + +static void +virtio_transport_send_pkt_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, send_pkt_work); + struct virtqueue *vq; + bool added = false; + bool restart_rx = false; + + mutex_lock(&vsock->tx_lock); + + vq = vsock->vqs[VSOCK_VQ_TX]; + + for (;;) { + struct virtio_vsock_pkt *pkt; + struct scatterlist hdr, buf, *sgs[2]; + int ret, in_sg = 0, out_sg = 0; + bool reply; + + spin_lock_bh(&vsock->send_pkt_list_lock); + if (list_empty(&vsock->send_pkt_list)) { + spin_unlock_bh(&vsock->send_pkt_list_lock); + break; + } + + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + reply = pkt->reply; + + sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); + sgs[out_sg++] = &hdr; + if (pkt->buf) { + sg_init_one(&buf, pkt->buf, pkt->len); + sgs[out_sg++] = &buf; + } + + ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL); + /* Usually this means that there is no more space available in + * the vq + */ + if (ret < 0) { + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + break; + } + + if (reply) { + struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; + int val; + + val = atomic_dec_return(&vsock->queued_replies); + + /* Do we now have resources to resume rx processing? */ + if (val + 1 == virtqueue_get_vring_size(rx_vq)) + restart_rx = true; + } + + added = true; + } + + if (added) + virtqueue_kick(vq); + + mutex_unlock(&vsock->tx_lock); + + if (restart_rx) + queue_work(virtio_vsock_workqueue, &vsock->rx_work); +} + +static int +virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock *vsock; + int len = pkt->len; + + vsock = virtio_vsock_get(); + if (!vsock) { + virtio_transport_free_pkt(pkt); + return -ENODEV; + } + + if (pkt->reply) + atomic_inc(&vsock->queued_replies); + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add_tail(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); + return len; +} + +static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) +{ + int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; + struct virtio_vsock_pkt *pkt; + struct scatterlist hdr, buf, *sgs[2]; + struct virtqueue *vq; + int ret; + + vq = vsock->vqs[VSOCK_VQ_RX]; + + do { + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + break; + + pkt->buf = kmalloc(buf_len, GFP_KERNEL); + if (!pkt->buf) { + virtio_transport_free_pkt(pkt); + break; + } + + pkt->len = buf_len; + + sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); + sgs[0] = &hdr; + + sg_init_one(&buf, pkt->buf, buf_len); + sgs[1] = &buf; + ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); + if (ret) { + virtio_transport_free_pkt(pkt); + break; + } + vsock->rx_buf_nr++; + } while (vq->num_free); + if (vsock->rx_buf_nr > vsock->rx_buf_max_nr) + vsock->rx_buf_max_nr = vsock->rx_buf_nr; + virtqueue_kick(vq); +} + +static void virtio_transport_tx_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, tx_work); + struct virtqueue *vq; + bool added = false; + + vq = vsock->vqs[VSOCK_VQ_TX]; + mutex_lock(&vsock->tx_lock); + do { + struct virtio_vsock_pkt *pkt; + unsigned int len; + + virtqueue_disable_cb(vq); + while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { + virtio_transport_free_pkt(pkt); + added = true; + } + } while (!virtqueue_enable_cb(vq)); + mutex_unlock(&vsock->tx_lock); + + if (added) + queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); +} + +/* Is there space left for replies to rx packets? */ +static bool virtio_transport_more_replies(struct virtio_vsock *vsock) +{ + struct virtqueue *vq = vsock->vqs[VSOCK_VQ_RX]; + int val; + + smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */ + val = atomic_read(&vsock->queued_replies); + + return val < virtqueue_get_vring_size(vq); +} + +static void virtio_transport_rx_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, rx_work); + struct virtqueue *vq; + + vq = vsock->vqs[VSOCK_VQ_RX]; + + mutex_lock(&vsock->rx_lock); + + do { + virtqueue_disable_cb(vq); + for (;;) { + struct virtio_vsock_pkt *pkt; + unsigned int len; + + if (!virtio_transport_more_replies(vsock)) { + /* Stop rx until the device processes already + * pending replies. Leave rx virtqueue + * callbacks disabled. + */ + goto out; + } + + pkt = virtqueue_get_buf(vq, &len); + if (!pkt) { + break; + } + + vsock->rx_buf_nr--; + + /* Drop short/long packets */ + if (unlikely(len < sizeof(pkt->hdr) || + len > sizeof(pkt->hdr) + pkt->len)) { + virtio_transport_free_pkt(pkt); + continue; + } + + pkt->len = len - sizeof(pkt->hdr); + virtio_transport_recv_pkt(pkt); + } + } while (!virtqueue_enable_cb(vq)); + +out: + if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) + virtio_vsock_rx_fill(vsock); + mutex_unlock(&vsock->rx_lock); +} + +/* event_lock must be held */ +static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock, + struct virtio_vsock_event *event) +{ + struct scatterlist sg; + struct virtqueue *vq; + + vq = vsock->vqs[VSOCK_VQ_EVENT]; + + sg_init_one(&sg, event, sizeof(*event)); + + return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL); +} + +/* event_lock must be held */ +static void virtio_vsock_event_fill(struct virtio_vsock *vsock) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(vsock->event_list); i++) { + struct virtio_vsock_event *event = &vsock->event_list[i]; + + virtio_vsock_event_fill_one(vsock, event); + } + + virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]); +} + +static void virtio_vsock_reset_sock(struct sock *sk) +{ + lock_sock(sk); + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = ECONNRESET; + sk->sk_error_report(sk); + release_sock(sk); +} + +static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock) +{ + struct virtio_device *vdev = vsock->vdev; + u64 guest_cid; + + vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid), + &guest_cid, sizeof(guest_cid)); + vsock->guest_cid = le64_to_cpu(guest_cid); +} + +/* event_lock must be held */ +static void virtio_vsock_event_handle(struct virtio_vsock *vsock, + struct virtio_vsock_event *event) +{ + switch (le32_to_cpu(event->id)) { + case VIRTIO_VSOCK_EVENT_TRANSPORT_RESET: + virtio_vsock_update_guest_cid(vsock); + vsock_for_each_connected_socket(virtio_vsock_reset_sock); + break; + } +} + +static void virtio_transport_event_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, event_work); + struct virtqueue *vq; + + vq = vsock->vqs[VSOCK_VQ_EVENT]; + + mutex_lock(&vsock->event_lock); + + do { + struct virtio_vsock_event *event; + unsigned int len; + + virtqueue_disable_cb(vq); + while ((event = virtqueue_get_buf(vq, &len)) != NULL) { + if (len == sizeof(*event)) + virtio_vsock_event_handle(vsock, event); + + virtio_vsock_event_fill_one(vsock, event); + } + } while (!virtqueue_enable_cb(vq)); + + virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]); + + mutex_unlock(&vsock->event_lock); +} + +static void virtio_vsock_event_done(struct virtqueue *vq) +{ + struct virtio_vsock *vsock = vq->vdev->priv; + + if (!vsock) + return; + queue_work(virtio_vsock_workqueue, &vsock->event_work); +} + +static void virtio_vsock_tx_done(struct virtqueue *vq) +{ + struct virtio_vsock *vsock = vq->vdev->priv; + + if (!vsock) + return; + queue_work(virtio_vsock_workqueue, &vsock->tx_work); +} + +static void virtio_vsock_rx_done(struct virtqueue *vq) +{ + struct virtio_vsock *vsock = vq->vdev->priv; + + if (!vsock) + return; + queue_work(virtio_vsock_workqueue, &vsock->rx_work); +} + +static struct virtio_transport virtio_transport = { + .transport = { + .get_local_cid = virtio_transport_get_local_cid, + + .init = virtio_transport_do_socket_init, + .destruct = virtio_transport_destruct, + .release = virtio_transport_release, + .connect = virtio_transport_connect, + .shutdown = virtio_transport_shutdown, + + .dgram_bind = virtio_transport_dgram_bind, + .dgram_dequeue = virtio_transport_dgram_dequeue, + .dgram_enqueue = virtio_transport_dgram_enqueue, + .dgram_allow = virtio_transport_dgram_allow, + + .stream_dequeue = virtio_transport_stream_dequeue, + .stream_enqueue = virtio_transport_stream_enqueue, + .stream_has_data = virtio_transport_stream_has_data, + .stream_has_space = virtio_transport_stream_has_space, + .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, + .stream_is_active = virtio_transport_stream_is_active, + .stream_allow = virtio_transport_stream_allow, + + .notify_poll_in = virtio_transport_notify_poll_in, + .notify_poll_out = virtio_transport_notify_poll_out, + .notify_recv_init = virtio_transport_notify_recv_init, + .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, + .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, + .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, + .notify_send_init = virtio_transport_notify_send_init, + .notify_send_pre_block = virtio_transport_notify_send_pre_block, + .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, + .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, + + .set_buffer_size = virtio_transport_set_buffer_size, + .set_min_buffer_size = virtio_transport_set_min_buffer_size, + .set_max_buffer_size = virtio_transport_set_max_buffer_size, + .get_buffer_size = virtio_transport_get_buffer_size, + .get_min_buffer_size = virtio_transport_get_min_buffer_size, + .get_max_buffer_size = virtio_transport_get_max_buffer_size, + }, + + .send_pkt = virtio_transport_send_pkt, +}; + +static int virtio_vsock_probe(struct virtio_device *vdev) +{ + vq_callback_t *callbacks[] = { + virtio_vsock_rx_done, + virtio_vsock_tx_done, + virtio_vsock_event_done, + }; + static const char * const names[] = { + "rx", + "tx", + "event", + }; + struct virtio_vsock *vsock = NULL; + int ret; + + ret = mutex_lock_interruptible(&the_virtio_vsock_mutex); + if (ret) + return ret; + + /* Only one virtio-vsock device per guest is supported */ + if (the_virtio_vsock) { + ret = -EBUSY; + goto out; + } + + vsock = kzalloc(sizeof(*vsock), GFP_KERNEL); + if (!vsock) { + ret = -ENOMEM; + goto out; + } + + vsock->vdev = vdev; + + ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, + vsock->vqs, callbacks, names); + if (ret < 0) + goto out; + + virtio_vsock_update_guest_cid(vsock); + + ret = vsock_core_init(&virtio_transport.transport); + if (ret < 0) + goto out_vqs; + + vsock->rx_buf_nr = 0; + vsock->rx_buf_max_nr = 0; + atomic_set(&vsock->queued_replies, 0); + + vdev->priv = vsock; + the_virtio_vsock = vsock; + mutex_init(&vsock->tx_lock); + mutex_init(&vsock->rx_lock); + mutex_init(&vsock->event_lock); + spin_lock_init(&vsock->send_pkt_list_lock); + INIT_LIST_HEAD(&vsock->send_pkt_list); + INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); + INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); + INIT_WORK(&vsock->event_work, virtio_transport_event_work); + INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work); + + mutex_lock(&vsock->rx_lock); + virtio_vsock_rx_fill(vsock); + mutex_unlock(&vsock->rx_lock); + + mutex_lock(&vsock->event_lock); + virtio_vsock_event_fill(vsock); + mutex_unlock(&vsock->event_lock); + + mutex_unlock(&the_virtio_vsock_mutex); + return 0; + +out_vqs: + vsock->vdev->config->del_vqs(vsock->vdev); +out: + kfree(vsock); + mutex_unlock(&the_virtio_vsock_mutex); + return ret; +} + +static void virtio_vsock_remove(struct virtio_device *vdev) +{ + struct virtio_vsock *vsock = vdev->priv; + struct virtio_vsock_pkt *pkt; + + flush_work(&vsock->rx_work); + flush_work(&vsock->tx_work); + flush_work(&vsock->event_work); + flush_work(&vsock->send_pkt_work); + + vdev->config->reset(vdev); + + mutex_lock(&vsock->rx_lock); + while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) + virtio_transport_free_pkt(pkt); + mutex_unlock(&vsock->rx_lock); + + mutex_lock(&vsock->tx_lock); + while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) + virtio_transport_free_pkt(pkt); + mutex_unlock(&vsock->tx_lock); + + spin_lock_bh(&vsock->send_pkt_list_lock); + while (!list_empty(&vsock->send_pkt_list)) { + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + mutex_lock(&the_virtio_vsock_mutex); + the_virtio_vsock = NULL; + vsock_core_exit(); + mutex_unlock(&the_virtio_vsock_mutex); + + vdev->config->del_vqs(vdev); + + kfree(vsock); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_VSOCK, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features[] = { +}; + +static struct virtio_driver virtio_vsock_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtio_vsock_probe, + .remove = virtio_vsock_remove, +}; + +static int __init virtio_vsock_init(void) +{ + int ret; + + virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0); + if (!virtio_vsock_workqueue) + return -ENOMEM; + ret = register_virtio_driver(&virtio_vsock_driver); + if (ret) + destroy_workqueue(virtio_vsock_workqueue); + return ret; +} + +static void __exit virtio_vsock_exit(void) +{ + unregister_virtio_driver(&virtio_vsock_driver); + destroy_workqueue(virtio_vsock_workqueue); +} + +module_init(virtio_vsock_init); +module_exit(virtio_vsock_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("virtio transport for vsock"); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c new file mode 100644 index 000000000..a53b3a16b --- /dev/null +++ b/net/vmw_vsock/virtio_transport_common.c @@ -0,0 +1,992 @@ +/* + * common code for virtio vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/* How long to wait for graceful shutdown of a connection */ +#define VSOCK_CLOSE_TIMEOUT (8 * HZ) + +static const struct virtio_transport *virtio_transport_get_ops(void) +{ + const struct vsock_transport *t = vsock_core_get_transport(); + + return container_of(t, struct virtio_transport, transport); +} + +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) +{ + struct virtio_vsock_pkt *pkt; + int err; + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + pkt->hdr.type = cpu_to_le16(info->type); + pkt->hdr.op = cpu_to_le16(info->op); + pkt->hdr.src_cid = cpu_to_le64(src_cid); + pkt->hdr.dst_cid = cpu_to_le64(dst_cid); + pkt->hdr.src_port = cpu_to_le32(src_port); + pkt->hdr.dst_port = cpu_to_le32(dst_port); + pkt->hdr.flags = cpu_to_le32(info->flags); + pkt->len = len; + pkt->hdr.len = cpu_to_le32(len); + pkt->reply = info->reply; + + if (info->msg && len > 0) { + pkt->buf = kmalloc(len, GFP_KERNEL); + if (!pkt->buf) + goto out_pkt; + err = memcpy_from_msg(pkt->buf, info->msg, len); + if (err) + goto out; + } + + trace_virtio_transport_alloc_pkt(src_cid, src_port, + dst_cid, dst_port, + len, + info->type, + info->op, + info->flags); + + return pkt; + +out: + kfree(pkt->buf); +out_pkt: + kfree(pkt); + return NULL; +} +EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt); + +static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info) +{ + u32 src_cid, src_port, dst_cid, dst_port; + struct virtio_vsock_sock *vvs; + struct virtio_vsock_pkt *pkt; + u32 pkt_len = info->pkt_len; + + src_cid = vm_sockets_get_local_cid(); + src_port = vsk->local_addr.svm_port; + if (!info->remote_cid) { + dst_cid = vsk->remote_addr.svm_cid; + dst_port = vsk->remote_addr.svm_port; + } else { + dst_cid = info->remote_cid; + dst_port = info->remote_port; + } + + vvs = vsk->trans; + + /* we can send less than pkt_len bytes */ + if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) + pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; + + /* virtio_transport_get_credit might return less than pkt_len credit */ + pkt_len = virtio_transport_get_credit(vvs, pkt_len); + + /* Do not send zero length OP_RW pkt */ + if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) + return pkt_len; + + pkt = virtio_transport_alloc_pkt(info, pkt_len, + src_cid, src_port, + dst_cid, dst_port); + if (!pkt) { + virtio_transport_put_credit(vvs, pkt_len); + return -ENOMEM; + } + + virtio_transport_inc_tx_pkt(vvs, pkt); + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes += pkt->len; +} + +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes -= pkt->len; + vvs->fwd_cnt += pkt->len; +} + +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) +{ + spin_lock_bh(&vvs->tx_lock); + pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); + pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); + spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); + +u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + u32 ret; + + spin_lock_bh(&vvs->tx_lock); + ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + if (ret > credit) + ret = credit; + vvs->tx_cnt += ret; + spin_unlock_bh(&vvs->tx_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_credit); + +void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + spin_lock_bh(&vvs->tx_lock); + vvs->tx_cnt -= credit; + spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_put_credit); + +static int virtio_transport_send_credit_update(struct vsock_sock *vsk, + int type, + struct virtio_vsock_hdr *hdr) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, + .type = type, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +static ssize_t +virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + while (total < len && !list_empty(&vvs->rx_queue)) { + pkt = list_first_entry(&vvs->rx_queue, + struct virtio_vsock_pkt, list); + + bytes = len - total; + if (bytes > pkt->len - pkt->off) + bytes = pkt->len - pkt->off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; + pkt->off += bytes; + if (pkt->off == pkt->len) { + virtio_transport_dec_rx_pkt(vvs, pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + } + spin_unlock_bh(&vvs->rx_lock); + + /* Send a credit pkt to peer */ + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); + + return total; + +out: + if (total) + err = total; + return err; +} + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_stream_do_dequeue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); + +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->rx_lock); + bytes = vvs->rx_bytes; + spin_unlock_bh(&vvs->rx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); + +static s64 virtio_transport_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + if (bytes < 0) + bytes = 0; + + return bytes; +} + +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->tx_lock); + bytes = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk) +{ + struct virtio_vsock_sock *vvs; + + vvs = kzalloc(sizeof(*vvs), GFP_KERNEL); + if (!vvs) + return -ENOMEM; + + vsk->trans = vvs; + vvs->vsk = vsk; + if (psk) { + struct virtio_vsock_sock *ptrans = psk->trans; + + vvs->buf_size = ptrans->buf_size; + vvs->buf_size_min = ptrans->buf_size_min; + vvs->buf_size_max = ptrans->buf_size_max; + vvs->peer_buf_alloc = ptrans->peer_buf_alloc; + } else { + vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; + vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; + vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; + } + + vvs->buf_alloc = vvs->buf_size; + + spin_lock_init(&vvs->rx_lock); + spin_lock_init(&vvs->tx_lock); + INIT_LIST_HEAD(&vvs->rx_queue); + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); + +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); + +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_min; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); + +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_max; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); + +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size_min) + vvs->buf_size_min = val; + if (val > vvs->buf_size_max) + vvs->buf_size_max = val; + vvs->buf_size = val; + vvs->buf_alloc = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); + +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val > vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_min = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); + +void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_max = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); + +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now) +{ + if (vsock_stream_has_data(vsk)) + *data_ready_now = true; + else + *data_ready_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); + +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_avail_now) +{ + s64 free_space; + + free_space = vsock_stream_has_space(vsk); + if (free_space > 0) + *space_avail_now = true; + else if (free_space == 0) + *space_avail_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); + +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); + +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); + +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); + +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); + +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); + +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); + +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); + +bool virtio_transport_stream_is_active(struct vsock_sock *vsk) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); + +bool virtio_transport_stream_allow(u32 cid, u32 port) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); + +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); + +bool virtio_transport_dgram_allow(u32 cid, u32 port) +{ + return false; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); + +int virtio_transport_connect(struct vsock_sock *vsk) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_REQUEST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_connect); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_SHUTDOWN, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .flags = (mode & RCV_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | + (mode & SEND_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_shutdown); + +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t dgram_len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RW, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .msg = msg, + .pkt_len = len, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); + +void virtio_transport_destruct(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + kfree(vvs); +} +EXPORT_SYMBOL_GPL(virtio_transport_destruct); + +static int virtio_transport_reset(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .reply = !!pkt, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Normally packets are associated with a socket. There may be no socket if an + * attempt was made to connect to a socket that does not exist. + */ +static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = le16_to_cpu(pkt->hdr.type), + .reply = true, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + pkt = virtio_transport_alloc_pkt(&info, 0, + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port), + le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + if (!pkt) + return -ENOMEM; + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, + sock_flag(sk, SOCK_DONE))) + break; + } while (!signal_pending(current) && timeout); + + finish_wait(sk_sleep(sk), &wait); + } +} + +static void virtio_transport_do_close(struct vsock_sock *vsk, + bool cancel_timeout) +{ + struct sock *sk = sk_vsock(vsk); + + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + sk->sk_state_change(sk); + + if (vsk->close_work_scheduled && + (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { + vsk->close_work_scheduled = false; + + vsock_remove_sock(vsk); + + /* Release refcnt obtained when we scheduled the timeout */ + sock_put(sk); + } +} + +static void virtio_transport_close_timeout(struct work_struct *work) +{ + struct vsock_sock *vsk = + container_of(work, struct vsock_sock, close_work.work); + struct sock *sk = sk_vsock(vsk); + + sock_hold(sk); + lock_sock(sk); + + if (!sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset(vsk, NULL); + + virtio_transport_do_close(vsk, false); + } + + vsk->close_work_scheduled = false; + + release_sock(sk); + sock_put(sk); +} + +/* User context, vsk->sk is locked */ +static bool virtio_transport_close(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + + if (!(sk->sk_state == SS_CONNECTED || + sk->sk_state == SS_DISCONNECTING)) + return true; + + /* Already received SHUTDOWN from peer, reply with RST */ + if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) { + (void)virtio_transport_reset(vsk, NULL); + return true; + } + + if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) + (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK); + + if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) + virtio_transport_wait_close(sk, sk->sk_lingertime); + + if (sock_flag(sk, SOCK_DONE)) { + return true; + } + + sock_hold(sk); + INIT_DELAYED_WORK(&vsk->close_work, + virtio_transport_close_timeout); + vsk->close_work_scheduled = true; + schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT); + return false; +} + +void virtio_transport_release(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + bool remove_sock = true; + + lock_sock(sk); + if (sk->sk_type == SOCK_STREAM) + remove_sock = virtio_transport_close(vsk); + release_sock(sk); + + if (remove_sock) + vsock_remove_sock(vsk); +} +EXPORT_SYMBOL_GPL(virtio_transport_release); + +static int +virtio_transport_recv_connecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + int err; + int skerr; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RESPONSE: + sk->sk_state = SS_CONNECTED; + sk->sk_socket->state = SS_CONNECTED; + vsock_insert_connected(vsk); + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_INVALID: + break; + case VIRTIO_VSOCK_OP_RST: + skerr = ECONNRESET; + err = 0; + goto destroy; + default: + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + return 0; + +destroy: + virtio_transport_reset(vsk, pkt); + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = skerr; + sk->sk_error_report(sk); + return err; +} + +static int +virtio_transport_recv_connected(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + int err = 0; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RW: + pkt->len = le32_to_cpu(pkt->hdr.len); + pkt->off = 0; + + spin_lock_bh(&vvs->rx_lock); + virtio_transport_inc_rx_pkt(vvs, pkt); + list_add_tail(&pkt->list, &vvs->rx_queue); + spin_unlock_bh(&vvs->rx_lock); + + sk->sk_data_ready(sk); + return err; + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + sk->sk_write_space(sk); + break; + case VIRTIO_VSOCK_OP_SHUTDOWN: + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + vsk->peer_shutdown |= RCV_SHUTDOWN; + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; + if (vsk->peer_shutdown == SHUTDOWN_MASK && + vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + if (le32_to_cpu(pkt->hdr.flags)) + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_RST: + virtio_transport_do_close(vsk, true); + break; + default: + err = -EINVAL; + break; + } + + virtio_transport_free_pkt(pkt); + return err; +} + +static void +virtio_transport_recv_disconnecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + virtio_transport_do_close(vsk, true); +} + +static int +virtio_transport_send_response(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RESPONSE, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .remote_cid = le32_to_cpu(pkt->hdr.src_cid), + .remote_port = le32_to_cpu(pkt->hdr.src_port), + .reply = true, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Handle server socket */ +static int +virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_sock *vchild; + struct sock *child; + + if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { + virtio_transport_reset(vsk, pkt); + return -EINVAL; + } + + if (sk_acceptq_is_full(sk)) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, + sk->sk_type, 0); + if (!child) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + sk->sk_ack_backlog++; + + lock_sock_nested(child, SINGLE_DEPTH_NESTING); + + child->sk_state = SS_CONNECTED; + + vchild = vsock_sk(child); + vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + + vsock_insert_connected(vchild); + vsock_enqueue_accept(sk, child); + virtio_transport_send_response(vchild, pkt); + + release_sock(child); + + sk->sk_data_ready(sk); + return 0; +} + +static bool virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + bool space_available; + + /* buf_alloc and fwd_cnt is always included in the hdr */ + spin_lock_bh(&vvs->tx_lock); + vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + return space_available; +} + +/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex + * lock. + */ +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +{ + struct sockaddr_vm src, dst; + struct vsock_sock *vsk; + struct sock *sk; + bool space_available; + + vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + + trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, + dst.svm_cid, dst.svm_port, + le32_to_cpu(pkt->hdr.len), + le16_to_cpu(pkt->hdr.type), + le16_to_cpu(pkt->hdr.op), + le32_to_cpu(pkt->hdr.flags), + le32_to_cpu(pkt->hdr.buf_alloc), + le32_to_cpu(pkt->hdr.fwd_cnt)); + + if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + + /* The socket must be in connected or bound table + * otherwise send reset back + */ + sk = vsock_find_connected_socket(&src, &dst); + if (!sk) { + sk = vsock_find_bound_socket(&dst); + if (!sk) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + } + + vsk = vsock_sk(sk); + + space_available = virtio_transport_space_update(sk, pkt); + + lock_sock(sk); + + /* Update CID in case it has changed after a transport reset event */ + vsk->local_addr.svm_cid = dst.svm_cid; + + if (space_available) + sk->sk_write_space(sk); + + switch (sk->sk_state) { + case VSOCK_SS_LISTEN: + virtio_transport_recv_listen(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTING: + virtio_transport_recv_connecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTED: + virtio_transport_recv_connected(sk, pkt); + break; + case SS_DISCONNECTING: + virtio_transport_recv_disconnecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + default: + virtio_transport_free_pkt(pkt); + break; + } + release_sock(sk); + + /* Release refcnt obtained when we fetched this socket out of the + * bound or connected list. + */ + sock_put(sk); + return; + +free_pkt: + virtio_transport_free_pkt(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); + +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +{ + kfree(pkt->buf); + kfree(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("common code for virtio vsock"); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 4120b7a53..4be4fbbc0 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1644,6 +1644,8 @@ static void vmci_transport_destruct(struct vsock_sock *vsk) static void vmci_transport_release(struct vsock_sock *vsk) { + vsock_remove_sock(vsk); + if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) { vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle); vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index da49c0b1f..0f506220a 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -513,6 +513,7 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, r = cfg80211_get_chans_dfs_available(wiphy, chandef->center_freq2, width); + break; default: WARN_ON(chandef->center_freq2); break; @@ -715,7 +716,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, ASSERT_RTNL(); - if (!config_enabled(CONFIG_CFG80211_REG_RELAX_NO_IR) || + if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) || !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)) return false; diff --git a/net/wireless/core.c b/net/wireless/core.c index ecca3896b..7645e9736 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -220,7 +220,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) - rdev->scan_req->aborted = true; + rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } } @@ -748,6 +748,36 @@ int wiphy_register(struct wiphy *wiphy) nl80211_send_reg_change_event(&request); } + /* Check that nobody globally advertises any capabilities they do not + * advertise on all possible interface types. + */ + if (wiphy->extended_capabilities_len && + wiphy->num_iftype_ext_capab && + wiphy->iftype_ext_capab) { + u8 supported_on_all, j; + const struct wiphy_iftype_ext_capab *capab; + + capab = wiphy->iftype_ext_capab; + for (j = 0; j < wiphy->extended_capabilities_len; j++) { + if (capab[0].extended_capabilities_len > j) + supported_on_all = + capab[0].extended_capabilities[j]; + else + supported_on_all = 0x00; + for (i = 1; i < wiphy->num_iftype_ext_capab; i++) { + if (j >= capab[i].extended_capabilities_len) { + supported_on_all = 0x00; + break; + } + supported_on_all &= + capab[i].extended_capabilities[j]; + } + if (WARN_ON(wiphy->extended_capabilities[j] & + ~supported_on_all)) + break; + } + } + rdev->wiphy.registered = true; rtnl_unlock(); @@ -1057,7 +1087,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) - rdev->scan_req->aborted = true; + rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } diff --git a/net/wireless/core.h b/net/wireless/core.h index 025b7a5d5..eee914439 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -141,6 +141,18 @@ struct cfg80211_internal_bss { unsigned long refcount; atomic_t hold; + /* time at the start of the reception of the first octet of the + * timestamp field of the last beacon/probe received for this BSS. + * The time is the TSF of the BSS specified by %parent_bssid. + */ + u64 parent_tsf; + + /* the BSS according to which %parent_tsf is set. This is set to + * the BSS that the interface that requested the scan was connected to + * when the beacon/probe was received. + */ + u8 parent_bssid[ETH_ALEN] __aligned(2); + /* must be last because of priv member */ struct cfg80211_bss pub; }; @@ -214,7 +226,7 @@ struct cfg80211_event { size_t req_ie_len; size_t resp_ie_len; struct cfg80211_bss *bss; - u16 status; + int status; /* -1 = failed; 0..65535 = status code */ } cr; struct { const u8 *req_ie; @@ -374,7 +386,7 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - u16 status, bool wextev, + int status, bool wextev, struct cfg80211_bss *bss); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7d38dd697..4809f4d2c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -167,6 +167,7 @@ __cfg80211_rdev_from_attrs(struct net *netns, struct nlattr **attrs) if (attrs[NL80211_ATTR_IFINDEX]) { int ifindex = nla_get_u32(attrs[NL80211_ATTR_IFINDEX]); + netdev = __dev_get_by_index(netns, ifindex); if (netdev) { if (netdev->ieee80211_ptr) @@ -404,6 +405,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PBSS] = { .type = NLA_FLAG }, [NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED }, [NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 }, + [NL80211_ATTR_MU_MIMO_GROUP_DATA] = { + .len = VHT_MUMIMO_GROUPS_DATA_LEN + }, + [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, }; /* policy for the key attributes */ @@ -731,6 +736,7 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k) if (tb[NL80211_KEY_DEFAULT_TYPES]) { struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; + err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1, tb[NL80211_KEY_DEFAULT_TYPES], nl80211_key_default_policy); @@ -1264,7 +1270,7 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg, struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; - long split_start, band_start, chan_start; + long split_start, band_start, chan_start, capa_start; bool split; }; @@ -1382,6 +1388,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->ops->get_antenna) { u32 tx_ant = 0, rx_ant = 0; int res; + res = rdev_get_antenna(rdev, &tx_ant, &rx_ant); if (!res) { if (nla_put_u32(msg, @@ -1761,6 +1768,47 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, nla_nest_end(msg, nested); } + state->split_start++; + break; + case 13: + if (rdev->wiphy.num_iftype_ext_capab && + rdev->wiphy.iftype_ext_capab) { + struct nlattr *nested_ext_capab, *nested; + + nested = nla_nest_start(msg, + NL80211_ATTR_IFTYPE_EXT_CAPA); + if (!nested) + goto nla_put_failure; + + for (i = state->capa_start; + i < rdev->wiphy.num_iftype_ext_capab; i++) { + const struct wiphy_iftype_ext_capab *capab; + + capab = &rdev->wiphy.iftype_ext_capab[i]; + + nested_ext_capab = nla_nest_start(msg, i); + if (!nested_ext_capab || + nla_put_u32(msg, NL80211_ATTR_IFTYPE, + capab->iftype) || + nla_put(msg, NL80211_ATTR_EXT_CAPA, + capab->extended_capabilities_len, + capab->extended_capabilities) || + nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK, + capab->extended_capabilities_len, + capab->extended_capabilities_mask)) + goto nla_put_failure; + + nla_nest_end(msg, nested_ext_capab); + if (state->split) + break; + } + nla_nest_end(msg, nested); + if (i < rdev->wiphy.num_iftype_ext_capab) { + state->capa_start = i + 1; + break; + } + } + /* done */ state->split_start = 0; break; @@ -2116,7 +2164,6 @@ static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) return rdev_set_wds_peer(rdev, dev, bssid); } - static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev; @@ -2251,6 +2298,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] && info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]) { u32 tx_ant, rx_ant; + if ((!rdev->wiphy.available_antennas_tx && !rdev->wiphy.available_antennas_rx) || !rdev->ops->set_antenna) @@ -2651,6 +2699,38 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) change = true; } + if (info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]) { + const u8 *mumimo_groups; + u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) + return -EOPNOTSUPP; + + mumimo_groups = + nla_data(info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]); + + /* bits 0 and 63 are reserved and must be zero */ + if ((mumimo_groups[0] & BIT(7)) || + (mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN - 1] & BIT(0))) + return -EINVAL; + + memcpy(params.vht_mumimo_groups, mumimo_groups, + VHT_MUMIMO_GROUPS_DATA_LEN); + change = true; + } + + if (info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]) { + u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) + return -EOPNOTSUPP; + + nla_memcpy(params.macaddr, + info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR], + ETH_ALEN); + change = true; + } + if (flags && (*flags & MONITOR_FLAG_ACTIVE) && !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) return -EOPNOTSUPP; @@ -2919,6 +2999,7 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) pairwise = !!mac_addr; if (info->attrs[NL80211_ATTR_KEY_TYPE]) { u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); + if (kt >= NUM_NL80211_KEYTYPES) return -EINVAL; if (kt != NL80211_KEYTYPE_GROUP && @@ -3962,7 +4043,6 @@ static int nl80211_dump_station(struct sk_buff *skb, sta_idx++; } - out: cb->args[2] = sta_idx; err = skb->len; @@ -4366,6 +4446,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]); if (params.plink_state >= NUM_NL80211_PLINK_STATES) return -EINVAL; + if (info->attrs[NL80211_ATTR_MESH_PEER_AID]) { + params.peer_aid = nla_get_u16( + info->attrs[NL80211_ATTR_MESH_PEER_AID]); + if (params.peer_aid > IEEE80211_MAX_AID) + return -EINVAL; + } params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE; } @@ -4763,7 +4849,6 @@ static int nl80211_dump_mpath(struct sk_buff *skb, path_idx++; } - out: cb->args[2] = path_idx; err = skb->len; @@ -5053,7 +5138,6 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) enum nl80211_user_reg_hint_type user_reg_hint_type; u32 owner_nlportid; - /* * You should only get this when cfg80211 hasn't yet initialized * completely when built-in to the kernel right between the time @@ -5245,24 +5329,68 @@ static const struct nla_policy [NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG }, }; +static int nl80211_check_bool(const struct nlattr *nla, u8 min, u8 max, bool *out) +{ + u8 val = nla_get_u8(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + +static int nl80211_check_u8(const struct nlattr *nla, u8 min, u8 max, u8 *out) +{ + u8 val = nla_get_u8(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + +static int nl80211_check_u16(const struct nlattr *nla, u16 min, u16 max, u16 *out) +{ + u16 val = nla_get_u16(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + +static int nl80211_check_u32(const struct nlattr *nla, u32 min, u32 max, u32 *out) +{ + u32 val = nla_get_u32(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + +static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *out) +{ + s32 val = nla_get_s32(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + static int nl80211_parse_mesh_config(struct genl_info *info, struct mesh_config *cfg, u32 *mask_out) { struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1]; u32 mask = 0; + u16 ht_opmode; #define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, min, max, mask, attr, fn) \ do { \ if (tb[attr]) { \ - if (fn(tb[attr]) < min || fn(tb[attr]) > max) \ + if (fn(tb[attr], min, max, &cfg->param)) \ return -EINVAL; \ - cfg->param = fn(tb[attr]); \ mask |= (1 << (attr - 1)); \ } \ } while (0) - if (!info->attrs[NL80211_ATTR_MESH_CONFIG]) return -EINVAL; if (nla_parse_nested(tb, NL80211_MESHCONF_ATTR_MAX, @@ -5277,99 +5405,126 @@ do { \ /* Fill in the params struct */ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, 1, 255, mask, NL80211_MESHCONF_RETRY_TIMEOUT, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, 1, 255, mask, NL80211_MESHCONF_CONFIRM_TIMEOUT, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, 1, 255, mask, NL80211_MESHCONF_HOLDING_TIMEOUT, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, 0, 255, mask, NL80211_MESHCONF_MAX_PEER_LINKS, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, 0, 16, mask, NL80211_MESHCONF_MAX_RETRIES, - nla_get_u8); + nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, 1, 255, - mask, NL80211_MESHCONF_TTL, nla_get_u8); + mask, NL80211_MESHCONF_TTL, nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, 1, 255, mask, NL80211_MESHCONF_ELEMENT_TTL, - nla_get_u8); + nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, 0, 1, mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS, - nla_get_u8); + nl80211_check_bool); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor, 1, 255, mask, NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, 0, 255, mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, - nla_get_u8); + nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, 1, 65535, mask, NL80211_MESHCONF_PATH_REFRESH_TIME, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, 1, 65535, mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout, 1, 65535, mask, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPnetDiameterTraversalTime, 1, 65535, mask, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, 0, 4, mask, NL80211_MESHCONF_HWMP_ROOTMODE, - nla_get_u8); + nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_RANN_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshGateAnnouncementProtocol, 0, 1, mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS, - nla_get_u8); + nl80211_check_bool); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, 0, 1, mask, NL80211_MESHCONF_FORWARDING, - nla_get_u8); + nl80211_check_bool); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, -255, 0, mask, NL80211_MESHCONF_RSSI_THRESHOLD, - nla_get_s32); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, ht_opmode, 0, 16, - mask, NL80211_MESHCONF_HT_OPMODE, - nla_get_u16); + nl80211_check_s32); + /* + * Check HT operation mode based on + * IEEE 802.11 2012 8.4.2.59 HT Operation element. + */ + if (tb[NL80211_MESHCONF_HT_OPMODE]) { + ht_opmode = nla_get_u16(tb[NL80211_MESHCONF_HT_OPMODE]); + + if (ht_opmode & ~(IEEE80211_HT_OP_MODE_PROTECTION | + IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT | + IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) + return -EINVAL; + + if ((ht_opmode & IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT) && + (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) + return -EINVAL; + + switch (ht_opmode & IEEE80211_HT_OP_MODE_PROTECTION) { + case IEEE80211_HT_OP_MODE_PROTECTION_NONE: + case IEEE80211_HT_OP_MODE_PROTECTION_20MHZ: + if (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT) + return -EINVAL; + break; + case IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER: + case IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED: + if (!(ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) + return -EINVAL; + break; + } + cfg->ht_opmode = ht_opmode; + } FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout, 1, 65535, mask, NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_ROOT_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPconfirmationInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode, NL80211_MESH_POWER_ACTIVE, NL80211_MESH_POWER_MAX, mask, NL80211_MESHCONF_POWER_MODE, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 0, 65535, mask, - NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); + NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff, mask, NL80211_MESHCONF_PLINK_TIMEOUT, - nla_get_u32); + nl80211_check_u32); if (mask_out) *mask_out = mask; @@ -5409,7 +5564,6 @@ static int nl80211_parse_mesh_setup(struct genl_info *info, IEEE80211_PATH_METRIC_VENDOR : IEEE80211_PATH_METRIC_AIRTIME; - if (tb[NL80211_MESH_SETUP_IE]) { struct nlattr *ieattr = tb[NL80211_MESH_SETUP_IE]; @@ -5796,10 +5950,8 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) } } - r = set_regdom(rd, REGD_SOURCE_CRDA); - /* set_regdom took ownership */ - rd = NULL; - + /* set_regdom takes ownership of rd */ + return set_regdom(rd, REGD_SOURCE_CRDA); bad_reg: kfree(rd); return r; @@ -6033,6 +6185,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) /* all channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; + if (!wiphy->bands[band]) continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { @@ -6104,6 +6257,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) { + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SET_SCAN_DWELL)) { + err = -EOPNOTSUPP; + goto out_free; + } + + request->duration = + nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]); + request->duration_mandatory = + nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]); + } + if (info->attrs[NL80211_ATTR_SCAN_FLAGS]) { request->flags = nla_get_u32( info->attrs[NL80211_ATTR_SCAN_FLAGS]); @@ -6442,6 +6608,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, /* all channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; + if (!wiphy->bands[band]) continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { @@ -6511,7 +6678,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, nla_data(ssid), nla_len(ssid)); request->match_sets[i].ssid.ssid_len = nla_len(ssid); - /* special attribute - old implemenation w/a */ + /* special attribute - old implementation w/a */ request->match_sets[i].rssi_thold = default_match_rssi; rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI]; @@ -6936,6 +7103,13 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, jiffies_to_msecs(jiffies - intbss->ts))) goto nla_put_failure; + if (intbss->parent_tsf && + (nla_put_u64_64bit(msg, NL80211_BSS_PARENT_TSF, + intbss->parent_tsf, NL80211_BSS_PAD) || + nla_put(msg, NL80211_BSS_PARENT_BSSID, ETH_ALEN, + intbss->parent_bssid))) + goto nla_put_failure; + if (intbss->ts_boottime && nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME, intbss->ts_boottime, NL80211_BSS_PAD)) @@ -7204,6 +7378,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if (key.idx >= 0) { int i; bool ok = false; + for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) { if (key.p.cipher == rdev->wiphy.cipher_suites[i]) { ok = true; @@ -7282,6 +7457,7 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) { u16 proto; + proto = nla_get_u16( info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]); settings->control_port_ethertype = cpu_to_be16(proto); @@ -8435,6 +8611,7 @@ static u32 rateset_to_mask(struct ieee80211_supported_band *sband, for (i = 0; i < rates_len; i++) { int rate = (rates[i] & 0x7f) * 5; int ridx; + for (ridx = 0; ridx < sband->n_bitrates; ridx++) { struct ieee80211_rate *srate = &sband->bitrates[ridx]; @@ -8743,7 +8920,6 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) if (params.wait < NL80211_MIN_REMAIN_ON_CHANNEL_TIME || params.wait > rdev->wiphy.max_remain_on_channel_duration) return -EINVAL; - } params.offchan = info->attrs[NL80211_ATTR_OFFCHANNEL_TX_OK]; @@ -10590,7 +10766,6 @@ int cfg80211_vendor_cmd_reply(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(cfg80211_vendor_cmd_reply); - static int nl80211_set_qos_map(struct sk_buff *skb, struct genl_info *info) { @@ -10945,7 +11120,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_WIPHY, .doit = nl80211_set_wiphy, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_RTNL, }, { @@ -10961,7 +11136,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_INTERFACE, .doit = nl80211_set_interface, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -10969,7 +11144,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_NEW_INTERFACE, .doit = nl80211_new_interface, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -10977,7 +11152,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_INTERFACE, .doit = nl80211_del_interface, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, @@ -10985,7 +11160,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_GET_KEY, .doit = nl80211_get_key, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -10993,7 +11168,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_KEY, .doit = nl80211_set_key, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, @@ -11002,7 +11177,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_NEW_KEY, .doit = nl80211_new_key, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, @@ -11011,14 +11186,14 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_KEY, .doit = nl80211_del_key, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_BEACON, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_set_beacon, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, @@ -11026,7 +11201,7 @@ static const struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_START_AP, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_start_ap, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, @@ -11034,7 +11209,7 @@ static const struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_STOP_AP, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_stop_ap, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, @@ -11051,7 +11226,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_STATION, .doit = nl80211_set_station, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11059,7 +11234,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_NEW_STATION, .doit = nl80211_new_station, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11067,7 +11242,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_STATION, .doit = nl80211_del_station, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11076,7 +11251,7 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_get_mpath, .dumpit = nl80211_dump_mpath, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11085,7 +11260,7 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_get_mpp, .dumpit = nl80211_dump_mpp, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11093,7 +11268,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_MPATH, .doit = nl80211_set_mpath, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11101,7 +11276,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_NEW_MPATH, .doit = nl80211_new_mpath, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11109,7 +11284,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_MPATH, .doit = nl80211_del_mpath, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11117,7 +11292,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_BSS, .doit = nl80211_set_bss, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11156,7 +11331,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_MESH_CONFIG, .doit = nl80211_update_mesh_config, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11164,7 +11339,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TRIGGER_SCAN, .doit = nl80211_trigger_scan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11172,7 +11347,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_ABORT_SCAN, .doit = nl80211_abort_scan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11185,7 +11360,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_START_SCHED_SCAN, .doit = nl80211_start_sched_scan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11193,7 +11368,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_STOP_SCHED_SCAN, .doit = nl80211_stop_sched_scan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11201,7 +11376,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_AUTHENTICATE, .doit = nl80211_authenticate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, @@ -11210,7 +11385,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_ASSOCIATE, .doit = nl80211_associate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11218,7 +11393,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEAUTHENTICATE, .doit = nl80211_deauthenticate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11226,7 +11401,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DISASSOCIATE, .doit = nl80211_disassociate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11234,7 +11409,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_JOIN_IBSS, .doit = nl80211_join_ibss, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11242,7 +11417,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_LEAVE_IBSS, .doit = nl80211_leave_ibss, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11252,7 +11427,7 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_testmode_do, .dumpit = nl80211_testmode_dump, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11261,7 +11436,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CONNECT, .doit = nl80211_connect, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11269,7 +11444,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DISCONNECT, .doit = nl80211_disconnect, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11277,7 +11452,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_WIPHY_NETNS, .doit = nl80211_wiphy_netns, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11290,7 +11465,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_PMKSA, .doit = nl80211_setdel_pmksa, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11298,7 +11473,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_PMKSA, .doit = nl80211_setdel_pmksa, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11306,7 +11481,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_FLUSH_PMKSA, .doit = nl80211_flush_pmksa, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11314,7 +11489,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, .doit = nl80211_remain_on_channel, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11322,7 +11497,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, .doit = nl80211_cancel_remain_on_channel, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11330,7 +11505,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, .doit = nl80211_set_tx_bitrate_mask, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11338,7 +11513,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_REGISTER_FRAME, .doit = nl80211_register_mgmt, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11346,7 +11521,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_FRAME, .doit = nl80211_tx_mgmt, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11354,7 +11529,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_FRAME_WAIT_CANCEL, .doit = nl80211_tx_mgmt_cancel_wait, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11362,7 +11537,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_POWER_SAVE, .doit = nl80211_set_power_save, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11378,7 +11553,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_CQM, .doit = nl80211_set_cqm, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11386,7 +11561,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_CHANNEL, .doit = nl80211_set_channel, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11394,7 +11569,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_WDS_PEER, .doit = nl80211_set_wds_peer, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11402,7 +11577,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_JOIN_MESH, .doit = nl80211_join_mesh, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11410,7 +11585,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_LEAVE_MESH, .doit = nl80211_leave_mesh, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11418,7 +11593,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_JOIN_OCB, .doit = nl80211_join_ocb, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11426,7 +11601,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_LEAVE_OCB, .doit = nl80211_leave_ocb, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11443,7 +11618,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_WOWLAN, .doit = nl80211_set_wowlan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11452,7 +11627,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_REKEY_OFFLOAD, .doit = nl80211_set_rekey_data, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, @@ -11461,7 +11636,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TDLS_MGMT, .doit = nl80211_tdls_mgmt, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11469,7 +11644,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TDLS_OPER, .doit = nl80211_tdls_oper, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11477,7 +11652,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_UNEXPECTED_FRAME, .doit = nl80211_register_unexpected_frame, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11485,7 +11660,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_PROBE_CLIENT, .doit = nl80211_probe_client, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11493,7 +11668,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_REGISTER_BEACONS, .doit = nl80211_register_beacons, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11501,7 +11676,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_NOACK_MAP, .doit = nl80211_set_noack_map, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11509,7 +11684,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_START_P2P_DEVICE, .doit = nl80211_start_p2p_device, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11517,7 +11692,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_STOP_P2P_DEVICE, .doit = nl80211_stop_p2p_device, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11525,7 +11700,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_MCAST_RATE, .doit = nl80211_set_mcast_rate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11533,7 +11708,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_MAC_ACL, .doit = nl80211_set_mac_acl, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11541,7 +11716,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_RADAR_DETECT, .doit = nl80211_start_radar_detection, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11554,7 +11729,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_UPDATE_FT_IES, .doit = nl80211_update_ft_ies, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11562,7 +11737,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CRIT_PROTOCOL_START, .doit = nl80211_crit_protocol_start, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11570,7 +11745,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CRIT_PROTOCOL_STOP, .doit = nl80211_crit_protocol_stop, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11585,7 +11760,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_COALESCE, .doit = nl80211_set_coalesce, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11593,7 +11768,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CHANNEL_SWITCH, .doit = nl80211_channel_switch, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11602,7 +11777,7 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_vendor_cmd, .dumpit = nl80211_vendor_cmd_dump, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11610,7 +11785,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_QOS_MAP, .doit = nl80211_set_qos_map, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11618,7 +11793,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_ADD_TX_TS, .doit = nl80211_add_tx_ts, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11626,7 +11801,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_TX_TS, .doit = nl80211_del_tx_ts, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11634,7 +11809,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH, .doit = nl80211_tdls_channel_switch, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11642,7 +11817,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH, .doit = nl80211_tdls_cancel_channel_switch, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11708,6 +11883,13 @@ static int nl80211_add_scan_req(struct sk_buff *msg, nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags)) goto nla_put_failure; + if (req->info.scan_start_tsf && + (nla_put_u64_64bit(msg, NL80211_ATTR_SCAN_START_TIME_TSF, + req->info.scan_start_tsf, NL80211_BSS_PAD) || + nla_put(msg, NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, ETH_ALEN, + req->info.tsf_bssid))) + goto nla_put_failure; + return 0; nla_put_failure: return -ENOBUFS; @@ -12092,7 +12274,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - u16 status, gfp_t gfp) + int status, gfp_t gfp) { struct sk_buff *msg; void *hdr; @@ -12110,7 +12292,10 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || (bssid && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) || - nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, status) || + nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, + status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE : + status) || + (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) || (req_ie && nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) || (resp_ie && @@ -12126,7 +12311,6 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put_failure: genlmsg_cancel(msg, hdr); nlmsg_free(msg); - } void nl80211_send_roamed(struct cfg80211_registered_device *rdev, @@ -12165,7 +12349,6 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, nla_put_failure: genlmsg_cancel(msg, hdr); nlmsg_free(msg); - } void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, @@ -12203,7 +12386,6 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, nla_put_failure: genlmsg_cancel(msg, hdr); nlmsg_free(msg); - } void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, @@ -13545,7 +13727,6 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp) if (hdr) genlmsg_cancel(msg, hdr); nlmsg_free(msg); - } EXPORT_SYMBOL(cfg80211_crit_proto_stopped); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 84d4edf1d..a63f402b1 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -55,7 +55,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - u16 status, gfp_t gfp); + int status, gfp_t gfp); void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ef2955c89..0358e12be 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3,6 +3,7 @@ * * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2016 Intel Deutschland GmbH */ #include #include @@ -194,7 +195,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); - if (!request->aborted && + if (!request->info.aborted && request->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); @@ -202,10 +203,10 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, spin_unlock_bh(&rdev->bss_lock); } - msg = nl80211_build_scan_msg(rdev, wdev, request->aborted); + msg = nl80211_build_scan_msg(rdev, wdev, request->info.aborted); #ifdef CONFIG_CFG80211_WEXT - if (wdev->netdev && !request->aborted) { + if (wdev->netdev && !request->info.aborted) { memset(&wrqu, 0, sizeof(wrqu)); wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL); @@ -236,12 +237,13 @@ void __cfg80211_scan_done(struct work_struct *wk) rtnl_unlock(); } -void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted) +void cfg80211_scan_done(struct cfg80211_scan_request *request, + struct cfg80211_scan_info *info) { - trace_cfg80211_scan_done(request, aborted); + trace_cfg80211_scan_done(request, info); WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req); - request->aborted = aborted; + request->info = *info; request->notified = true; queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk); } @@ -843,6 +845,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, found->pub.capability = tmp->pub.capability; found->ts = tmp->ts; found->ts_boottime = tmp->ts_boottime; + found->parent_tsf = tmp->parent_tsf; + ether_addr_copy(found->parent_bssid, tmp->parent_bssid); } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; @@ -1086,6 +1090,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); tmp.ts_boottime = data->boottime_ns; + tmp.parent_tsf = data->parent_tsf; + ether_addr_copy(tmp.parent_bssid, data->parent_bssid); signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 584fdc347..add6824c4 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -244,9 +244,7 @@ void cfg80211_conn_work(struct work_struct *work) if (cfg80211_conn_do_work(wdev)) { __cfg80211_connect_result( wdev->netdev, bssid, - NULL, 0, NULL, 0, - WLAN_STATUS_UNSPECIFIED_FAILURE, - false, NULL); + NULL, 0, NULL, 0, -1, false, NULL); } wdev_unlock(wdev); } @@ -648,7 +646,7 @@ static DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - u16 status, bool wextev, + int status, bool wextev, struct cfg80211_bss *bss) { struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -757,7 +755,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, struct cfg80211_bss *bss, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, u16 status, gfp_t gfp) + size_t resp_ie_len, int status, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 3c1091ae6..72b5255ce 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2642,8 +2642,9 @@ TRACE_EVENT(cfg80211_tdls_oper_request, ); TRACE_EVENT(cfg80211_scan_done, - TP_PROTO(struct cfg80211_scan_request *request, bool aborted), - TP_ARGS(request, aborted), + TP_PROTO(struct cfg80211_scan_request *request, + struct cfg80211_scan_info *info), + TP_ARGS(request, info), TP_STRUCT__entry( __field(u32, n_channels) __dynamic_array(u8, ie, request ? request->ie_len : 0) @@ -2652,6 +2653,8 @@ TRACE_EVENT(cfg80211_scan_done, MAC_ENTRY(wiphy_mac) __field(bool, no_cck) __field(bool, aborted) + __field(u64, scan_start_tsf) + MAC_ENTRY(tsf_bssid) ), TP_fast_assign( if (request) { @@ -2666,9 +2669,16 @@ TRACE_EVENT(cfg80211_scan_done, request->wiphy->perm_addr); __entry->no_cck = request->no_cck; } - __entry->aborted = aborted; + if (info) { + __entry->aborted = info->aborted; + __entry->scan_start_tsf = info->scan_start_tsf; + MAC_ASSIGN(tsf_bssid, info->tsf_bssid); + } ), - TP_printk("aborted: %s", BOOL_TO_STR(__entry->aborted)) + TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: " MAC_PR_FMT, + BOOL_TO_STR(__entry->aborted), + (unsigned long long)__entry->scan_start_tsf, + MAC_PR_ARG(tsf_bssid)) ); DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_results, @@ -2721,6 +2731,8 @@ TRACE_EVENT(cfg80211_inform_bss_frame, __dynamic_array(u8, mgmt, len) __field(s32, signal) __field(u64, ts_boottime) + __field(u64, parent_tsf) + MAC_ENTRY(parent_bssid) ), TP_fast_assign( WIPHY_ASSIGN; @@ -2730,10 +2742,15 @@ TRACE_EVENT(cfg80211_inform_bss_frame, memcpy(__get_dynamic_array(mgmt), mgmt, len); __entry->signal = data->signal; __entry->ts_boottime = data->boottime_ns; - ), - TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d, tsb:%llu", - WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, - __entry->signal, (unsigned long long)__entry->ts_boottime) + __entry->parent_tsf = data->parent_tsf; + MAC_ASSIGN(parent_bssid, data->parent_bssid); + ), + TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT + "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: " + MAC_PR_FMT, WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, + __entry->signal, (unsigned long long)__entry->ts_boottime, + (unsigned long long)__entry->parent_tsf, + MAC_PR_ARG(parent_bssid)) ); DECLARE_EVENT_CLASS(cfg80211_bss_evt, diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 1c4ad477c..6e3f0254d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -207,15 +207,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) family = XFRM_SPI_SKB_CB(skb)->family; /* if tunnel is present override skb->mark value with tunnel i_key */ - if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) { - switch (family) { - case AF_INET: + switch (family) { + case AF_INET: + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); - break; - case AF_INET6: + break; + case AF_INET6: + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); - break; - } + break; } /* Allocate new secpath or COW existing one. */ diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b5e665b3c..45f9cf97e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -626,6 +626,10 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + if (xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { + /* skip socket policies */ + continue; + } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, policy->family, diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9895a8c56..a30f898dc 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -332,6 +332,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { tasklet_hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); + kfree(x->aead); kfree(x->aalg); kfree(x->ealg); kfree(x->calg); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d516845e1..08892091c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -581,9 +581,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (err) goto error; - if (attrs[XFRMA_SEC_CTX] && - security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX]))) - goto error; + if (attrs[XFRMA_SEC_CTX]) { + err = security_xfrm_state_alloc(x, + nla_data(attrs[XFRMA_SEC_CTX])); + if (err) + goto error; + } if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) @@ -896,7 +899,8 @@ static int xfrm_dump_sa_done(struct netlink_callback *cb) struct sock *sk = cb->skb->sk; struct net *net = sock_net(sk); - xfrm_state_walk_done(walk, net); + if (cb->args[0]) + xfrm_state_walk_done(walk, net); return 0; } @@ -921,8 +925,6 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) u8 proto = 0; int err; - cb->args[0] = 1; - err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy); if (err < 0) @@ -939,6 +941,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) proto = nla_get_u8(attrs[XFRMA_PROTO]); xfrm_state_walk_init(walk, proto, filter); + cb->args[0] = 1; } (void) xfrm_state_walk(net, walk, dump_one_state, &info); @@ -2051,9 +2054,6 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (up->hard) { xfrm_policy_delete(xp, p->dir); xfrm_audit_policy_delete(xp, 1, true); - } else { - // reset the timers here? - WARN(1, "Don't know what to do with soft policy expire\n"); } km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid); @@ -2117,7 +2117,7 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, err = verify_newpolicy_info(&ua->policy); if (err) - goto bad_policy; + goto free_state; /* build an XP */ xp = xfrm_policy_construct(net, &ua->policy, attrs, &err); @@ -2149,8 +2149,6 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; -bad_policy: - WARN(1, "BAD policy passed\n"); free_state: kfree(x); nomem: -- cgit v1.2.3